Commit 366ebb93 authored by sneaxiy

test=develop

@@ -28,3 +28,4 @@ third_party/
build_*
# clion workspace.
cmake-build-*
model_test
@@ -62,13 +62,12 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
-option(WITH_INFERENCE "Compile fluid inference library" ON)
+option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
@@ -179,6 +178,7 @@ include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
include(external/xxhash) # download xxhash
if (NOT WIN32)
# there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -301,3 +301,11 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (ON_INFER)
message(STATUS "Inference mode is enabled; inference-specific optimizations will be applied.")
add_definitions(-DPADDLE_ON_INFERENCE)
else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "Inference-specific optimizations are not applied in this build. Turn on the ON_INFER flag when building the inference library only.")
endif()
@@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
pip3 install -U docopt PyYAML sphinx==1.5.6 && \
pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
easy_install -U pip && \
-pip install -U wheel && \
+pip install -U pip setuptools wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip3 install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip3 install opencv-python && \
-pip install pre-commit 'ipython==5.3.0' && \
+pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
...
@@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
+### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
### Install Latest Stable Release:
```
# Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.0.1.post87
+pip install paddlepaddle-gpu==1.1.0.post87
# Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.0.1.post85
+pip install paddlepaddle-gpu==1.1.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
...
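A quick way to sanity-check an install from the block above (illustrative only; this snippet is not part of the README):

```
import paddle.fluid as fluid

# Build and run a trivial program on CPU to confirm the wheel imports and executes.
x = fluid.layers.fill_constant(shape=[1], dtype='float32', value=1.0)
exe = fluid.Executor(fluid.CPUPlace())
print(exe.run(fluid.default_main_program(), fetch_list=[x]))
```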
@@ -142,5 +142,10 @@ def parse_args():
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
parser.add_argument(
'--fuse_broadcast_op',
action='store_true',
help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
)
args = parser.parse_args()
return args
@@ -177,6 +177,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
avg_loss = train_args[0]
@@ -240,7 +241,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
if args.use_fake_data or args.use_reader_op:
try:
fetch_ret = exe.run(fetch_list)
except fluid.core.EOFException as eof:
break
...
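For context, the new `--fuse_broadcast_op` flag above feeds the `fuse_broadcast_op` field this commit adds to `BuildStrategy`. A minimal stand-alone sketch of the same wiring (illustrative, not code from this commit):

```
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Added in this commit: merge per-parameter broadcast ops into fused_broadcast
# ops when the multi-device graph is built; the strategy object is then passed
# to fluid.ParallelExecutor via its build_strategy argument.
build_strategy.fuse_broadcast_op = True
```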
INCLUDE(ExternalProject)
set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
IF(WITH_STATIC_LIB)
SET(BUILD_CMD make lib)
ELSE()
IF(APPLE)
SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
ELSE(APPLE)
SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
ENDIF(APPLE)
ENDIF()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
TEST_COMMAND ""
)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
IF(WITH_C_API)
INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
IF(ANDROID)
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
ENDIF()
ENDIF()
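For context, a hedged sketch of how this new dependency is exercised from Python: this commit also adds `fluid.layers.hash` (see the API.spec changes below), which presumably relies on the xxhash library built here. The input layout in this snippet is assumed for illustration, not taken from the commit:

```
import paddle.fluid as fluid

# A sequence of int64 ids is hashed num_hash times modulo hash_size.
ids = fluid.layers.data(name='ids', shape=[1], dtype='int64', lod_level=1)
hashed = fluid.layers.hash(input=ids, hash_size=10000, num_hash=4)
```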
@@ -31,7 +31,7 @@ function(copy TARGET)
foreach(index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
@@ -67,6 +67,13 @@ copy(boost_lib
DEPS boost
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
if(NOT PROTOBUF_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
@@ -186,7 +193,7 @@ copy(cmake_cache
DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
...
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
endif()
add_subdirectory(testing)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
add_subdirectory(fluid)
endif()
@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
@@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@@ -107,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label',
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@@ -174,7 +174,12 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -353,6 +358,8 @@ paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_wind
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
...
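For reference, the updated dropout ArgSpec above corresponds to calls like the following (an illustrative sketch, not part of the spec file; 'upscale_in_train' is assumed to be the alternative to the default 'downgrade_in_infer'):

```
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[32], dtype='float32')
# The new keyword defaults to 'downgrade_in_infer', preserving the old behavior.
out = fluid.layers.dropout(x, dropout_prob=0.5,
                           dropout_implementation='upscale_in_train')
```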
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
add_subdirectory(recordio)
endif(NOT WIN32)
-if(WITH_INFERENCE)
-  # NOTE: please add subdirectory inference at last.
-  add_subdirectory(inference)
-  add_subdirectory(train)
-endif()
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
+add_subdirectory(train)
@@ -64,6 +64,13 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
case proto::AttrType::LONG: {
return attr_desc.l();
}
case proto::AttrType::LONGS: {
std::vector<int64_t> val(attr_desc.longs_size());
for (int i = 0; i < attr_desc.longs_size(); ++i) {
val[i] = attr_desc.longs(i);
}
return val;
}
default:
PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
}
...
@@ -26,6 +26,113 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename T>
struct ExtractAttribute {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
T* operator()(Attribute& attr) const {
T* attr_value = nullptr;
try {
attr_value = &boost::get<T>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
attr_name_, paddle::platform::demangle(typeid(T).name()),
paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
// special handle bool
// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
// hard to change the logic there. In another way, we should correct handle
// if the user set `some_flag=1`.
//
// FIX ME anytime if there is a better solution.
template <>
struct ExtractAttribute<bool> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
bool* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<bool>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
float val = boost::get<float>(attr);
attr = static_cast<bool>(val);
}
bool* attr_value = nullptr;
try {
attr_value = &boost::get<bool>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
template <>
struct ExtractAttribute<int64_t> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
int64_t* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<int64_t>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
int val = boost::get<float>(attr);
attr = static_cast<int64_t>(val);
}
int64_t* attr_value = nullptr;
try {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
template <>
struct ExtractAttribute<std::vector<int64_t>> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
std::vector<int64_t>* operator()(Attribute& attr) const {
if (attr.type() == typeid(std::vector<int>)) { // NOLINT
std::vector<int> val = boost::get<std::vector<int>>(attr);
std::vector<int64_t> vec(val.begin(), val.end());
attr = vec;
} else if (attr.type() == typeid(std::vector<float>)) { // NOLINT
std::vector<float> val = boost::get<std::vector<float>>(attr);
std::vector<int64_t> vec(val.begin(), val.end());
attr = vec;
}
std::vector<int64_t>* attr_value = nullptr;
try {
attr_value = &boost::get<std::vector<int64_t>>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
template <typename T>
inline proto::AttrType AttrTypeID() {
Attribute tmp = T();
@@ -42,7 +149,11 @@ class AttrReader {
inline const T& Get(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
return boost::get<T>(attrs_.at(name));
Attribute& attr = const_cast<Attribute&>(attrs_.at(name));
ExtractAttribute<T> extract_attr(name);
T* attr_value = extract_attr(attr);
return *attr_value;
}
private:
@@ -82,7 +193,7 @@ class DefaultValueSetter {
public:
explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
-void operator()(T& value) const { value = default_value_; }
+void operator()(T& value) const { value = default_value_; }  // NOLINT
private:
T default_value_;
@@ -117,84 +228,6 @@ class EnumInContainer {
std::unordered_set<T> container_;
};
template <typename T>
struct ExtractAttribute {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
T* operator()(Attribute& attr) const {
T* attr_value = nullptr;
try {
attr_value = &boost::get<T>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
attr_name_, paddle::platform::demangle(typeid(T).name()),
paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
// special handle bool
// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
// hard to change the logic there. In another way, we should correct handle
// if the user set `some_flag=1`.
//
// FIX ME anytime if there is a better solution.
template <>
struct ExtractAttribute<bool> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
bool* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<bool>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
float val = boost::get<float>(attr);
attr = static_cast<bool>(val);
}
bool* attr_value = nullptr;
try {
attr_value = &boost::get<bool>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
template <>
struct ExtractAttribute<int64_t> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
int64_t* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<int64_t>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
int val = boost::get<float>(attr);
attr = static_cast<int64_t>(val);
}
int64_t* attr_value = nullptr;
try {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
const std::string& attr_name_;
};
// check whether a certain attribute fit its limits
// an attribute can have more than one limits
template <typename T>
@@ -235,7 +268,7 @@ class TypedAttrChecker {
return *this;
}
-void operator()(AttributeMap& attr_map) const {
+void operator()(AttributeMap& attr_map) const {  // NOLINT
if (!attr_map.count(attr_name_)) {
// user do not set this attr
PADDLE_ENFORCE(!default_value_setter_.empty(),
@@ -271,7 +304,7 @@ class OpAttrChecker {
return *(checker.target<TypedAttrChecker<T>>());
}
-void Check(AttributeMap& attr_map) const {
+void Check(AttributeMap& attr_map) const {  // NOLINT
for (const auto& checker : attr_checkers_) {
checker(attr_map);
}
...
@@ -16,12 +16,14 @@ if(WITH_GPU)
dynload_cuda variable_visitor)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
endif()
cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
@@ -34,7 +36,7 @@ if(WITH_GPU)
endif()
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
+scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
if(WITH_GPU)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
@@ -54,8 +56,9 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
-fuse_elewise_add_act_pass)
+fuse_elewise_add_act_pass multi_batch_merge_pass)
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
-this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
}
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
-platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto &p = places_[i];
auto *var = scope.FindVar(out_var_handles[i]->name_);
-auto *dev_ctx = dev_ctxes_[p];
+auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
...
@@ -48,16 +48,27 @@ void BroadcastOpHandle::RunImpl() {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
BroadcastOneVar(*in_var_handle, out_var_handles, var_scopes);
}
void BroadcastOpHandle::BroadcastOneVar(
const VarHandle &in_var_handle,
const std::vector<VarHandle *> &out_var_handles,
const std::vector<const Scope *> &var_scopes) {
auto *in_var =
-var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
PADDLE_ENFORCE_NOT_NULL(in_var);
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
return;
}
-InitOutputValue(*in_var_handle, out_var_handles);
+InitOutputValue(in_var_handle, out_var_handles);
if (platform::is_cpu_place(in_tensor.place())) {
for (auto *out_var_handle : out_var_handles) {
-if (out_var_handle->IsTheSameVar(*in_var_handle)) {
+if (out_var_handle->IsTheSameVar(in_var_handle)) {
continue;
}
auto &out_p = out_var_handle->place_;
@@ -114,12 +125,12 @@ void BroadcastOpHandle::RunImpl() {
}
}
-if (!out_handle->IsTheSameVar(*in_var_handle)) {
+if (!out_handle->IsTheSameVar(in_var_handle)) {
-auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+auto out_var = var_scopes.at(in_var_handle.scope_idx_)
->FindVar(out_var_handles[0]->name_);
paddle::framework::TensorCopy(
-in_tensor, in_var_handle->place_,
+in_tensor, in_var_handle.place_,
-*(dev_ctxes_.at(in_var_handle->place_)),
+*(dev_ctxes_.at(in_var_handle.place_)),
&VariableVisitor::GetMutableTensor(out_var));
}
});
...
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
-dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                       p_ctx.second.ctx_.get());
}
}
}
@@ -61,7 +62,10 @@ struct BroadcastOpHandle : public OpHandleBase {
protected:
void RunImpl() override;
-private:
+void BroadcastOneVar(const VarHandle &in_var_handle,
+                     const std::vector<VarHandle *> &out_var_handles,
+                     const std::vector<const Scope *> &var_scopes);
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
...
@@ -12,232 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> gpu_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
}
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_.reset(nullptr);
#endif
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
param_scopes_[input_scope_idx]->Var("input");
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
op_handle_.reset(
new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
#endif
}
std::unique_ptr<ir::Node> v =
ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
std::unique_ptr<ir::Node> v2 =
ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v2.get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
if (!use_gpu_) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
}
std::unique_ptr<ir::Node> v3 =
ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
std::unique_ptr<ir::Node> v4 =
ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v4.get()));
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddOutput(out_dummy_var_handle);
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
f::LoD lod{{0, 10, 20}};
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
in_lod_tensor->set_lod(lod);
in_lod_tensor->Resize(kDims);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto out_tensor = out_var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
int height = static_cast<int>(kDims[0]) * 2;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
in_selected_rows->set_height(height);
in_selected_rows->set_rows(rows);
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
op_handle_->Run(false);
WaitAll();
p::CPUPlace cpu_place;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
auto out_var = param_scopes_[j]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
"height is not equal.");
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
}
f::Tensor result_tensor;
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
}
};
TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
namespace f = paddle::framework;
namespace p = paddle::platform;
// test data amount
const f::DDim kDims = {20, 20};
struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
std::vector<Scope*> local_scopes_;
std::vector<Scope*> param_scopes_;
Scope g_scope_;
std::unique_ptr<OpHandleBase> op_handle_;
std::vector<std::unique_ptr<VarHandleBase>> vars_;
std::vector<p::Place> place_list_;
bool use_gpu_;
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
void WaitAll() {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#ifdef PADDLE_WITH_CUDA
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
#endif
}
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
auto p = p::CUDAPlace(i);
place_list_.push_back(p);
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
int count = 8;
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
place_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_.reset(nullptr);
#endif
}
}
void InitBroadcastOp(size_t input_scope_idx) {
for (size_t j = 0; j < place_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
local_scope.Var("out");
param_scopes_.emplace_back(&local_scope);
}
param_scopes_[input_scope_idx]->Var("input");
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
place_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
place_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(
new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
#endif
}
std::unique_ptr<ir::Node> v =
ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
place_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
std::unique_ptr<ir::Node> v2 =
ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v2.get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < place_list_.size(); ++j) {
if (!use_gpu_) {
op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
}
std::unique_ptr<ir::Node> v3 =
ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
std::unique_ptr<ir::Node> v4 =
ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
vars_.emplace_back(new DummyVarHandle(v4.get()));
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddOutput(out_dummy_var_handle);
}
std::vector<float> InitLoDTensor(const std::string& varname,
size_t input_scope_idx, const f::LoD& lod,
float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k + val_scalar;
}
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), lod_tensor);
lod_tensor->set_lod(lod);
lod_tensor->Resize(kDims);
return send_vector;
}
std::vector<float> InitSelectedRows(const std::string& varname,
size_t input_scope_idx,
const std::vector<int64_t>& rows,
int height, float value_scalar = 0.0) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
send_vector[k] = k + value_scalar;
}
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
selected_rows->set_height(height);
selected_rows->set_rows(rows);
paddle::framework::TensorFromVector<float>(
send_vector, *(ctxs_[input_scope_idx]), value);
return send_vector;
}
void SelectedRowsEqual(const std::string& varname, int input_scope_idx,
const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
}
p::CPUPlace cpu_place;
f::Tensor result_tensor;
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
}
}
void LoDTensorEqual(const std::string& varname,
const std::vector<float>& send_vec, const f::LoD& lod,
framework::Scope* scope) {
p::CPUPlace cpu_place;
auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t k = 0; k < f::product(kDims); ++k) {
ASSERT_NEAR(ct[k], send_vec[k], 1e-5);
}
}
void TestBroadcastLodTensor(size_t input_scope_idx) {
f::LoD lod{{0, 10, 20}};
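    // A level-0 LoD of {0, 10, 20} marks two sequences covering rows [0, 10)
    // and [10, 20) of the kDims-shaped tensor.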
auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
op_handle_->Run(false);
WaitAll();
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual("out", send_vector, lod, param_scopes_[j]);
}
}
void TestBroadcastSelectedRows(size_t input_scope_idx) {
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2);
auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
op_handle_->Run(false);
WaitAll();
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height);
}
}
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -121,6 +121,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -121,6 +121,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass); USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass); USE_PASS(multi_devices_print_pass);
...@@ -69,6 +69,8 @@ struct BuildStrategy { ...@@ -69,6 +69,8 @@ struct BuildStrategy {
bool enable_data_balance_{false}; bool enable_data_balance_{false};
bool fuse_broadcast_op_{false};
// User normally doesn't need to call this API. // User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes // The PassBuilder allows for more customized insert, remove of passes
// from python side. // from python side.
......
...@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() { ...@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait = bool need_wait =
in_var && in_var->GeneratedOp() && in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
return need_wait; return need_wait;
} }
......
...@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle( ...@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) { : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) { if (ctxs) {
for (auto &p : places_) { for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p); this->SetDeviceContext(p, ctxs->DevCtx(p));
} }
} }
} }
...@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() { ...@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1, PADDLE_ENFORCE_GT(places_.size(), 1,
"Data balance can only be enabled when the number of " "Data balance can only be enabled when the number of "
"places to run larger than 1."); "places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
......
...@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
size_t num_complete = 0; size_t num_complete = 0;
remaining_ = 0; remaining_ = 0;
BlockingQueue<size_t> complete_q; auto complete_q = std::make_shared<BlockingQueue<size_t>>();
for (auto op : bootstrap_ops_) { for (auto op : bootstrap_ops_) {
RunOpAsync(op_deps.get(), op, &complete_q); RunOpAsync(op_deps.get(), op, complete_q);
} }
while (num_complete != op_deps->size()) { while (num_complete != op_deps->size()) {
size_t num_comp = complete_q.Pop(); size_t num_comp = complete_q->Pop();
if (num_comp == -1UL) { if (num_comp == -1UL) {
int remaining = 0; int remaining = 0;
while (true) { while (true) {
...@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
break; break;
} }
for (int i = 0; i < remaining; ++i) { for (int i = 0; i < remaining; ++i) {
complete_q.Pop(); complete_q->Pop();
} }
} }
exception_.ReThrow(); exception_.ReThrow();
...@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
} }
void FastThreadedSSAGraphExecutor::RunOpAsync( void FastThreadedSSAGraphExecutor::RunOpAsync(
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q) { OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
++remaining_; ++remaining_;
this->pool_.enqueue([=] { this->pool_.enqueue([=] {
OpHandleBase *op_to_run = op; OpHandleBase *op_to_run = op;
...@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
if (op_to_run == nullptr) { if (op_to_run == nullptr) {
op_to_run = pending_op; op_to_run = pending_op;
} else { } else {
this->RunOpAsync(op_deps, pending_op, complete_q); RunOpAsync(op_deps, pending_op, complete_q);
} }
} }
} }
...@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
} }
void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
atomic_op_deps_ = pool_.enqueue([&] { atomic_op_deps_ = pool_.enqueue([&] {
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps = auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
new std::unordered_map<OpHandleBase *, std::atomic<int>>;
for (auto &pair : op_deps_) { for (auto &pair : op_deps_) {
(*op_deps)[pair.first] = pair.second; (*op_deps)[pair.first] = pair.second;
} }
......
...@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::atomic<int> remaining_; std::atomic<int> remaining_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps, void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q); OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps(); void PrepareAtomicOpDeps();
......
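A plausible motivation for switching the completion queue from a raw pointer to a shared_ptr (an inference; the diff itself gives no rationale) is that the lambdas enqueued on the thread pool capture the queue by value, so worker threads still draining ops can never be left holding a dangling pointer once the caller unwinds. A minimal standalone sketch of that ownership pattern, using std::thread and a toy queue rather than Paddle's BlockingQueue:
#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <thread>
// Toy thread-safe queue standing in for BlockingQueue<size_t>.
template <typename T>
class ToyQueue {
 public:
  void Push(T v) {
    std::lock_guard<std::mutex> g(m_);
    q_.push(std::move(v));
    cv_.notify_one();
  }
  T Pop() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T v = q_.front();
    q_.pop();
    return v;
  }
 private:
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<T> q_;
};
int main() {
  auto complete_q = std::make_shared<ToyQueue<size_t>>();
  // The worker captures the shared_ptr by value, so the queue stays alive
  // even if the caller's scope is torn down before the worker finishes.
  std::thread worker([complete_q] { complete_q->Push(1); });
  std::cout << "completed ops: " << complete_q->Pop() << "\n";
  worker.join();
  return 0;
}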
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1UL) return;
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
WaitInputVarGenerated();
std::vector<const Scope *> var_scopes;
for (auto *s : local_scopes_) {
var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
}
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar(
*in_var_handles[i],
std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
out_var_handles.begin() + (i + 1) * place_num),
var_scopes);
}
}
std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }
} // namespace details
} // namespace framework
} // namespace paddle
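FusedBroadcastOpHandle::RunImpl above relies on the output handles being laid out variable-major: with N input variables and P places, the P outputs of variable i occupy the half-open range [i * P, (i + 1) * P) of out_var_handles, which is what the PADDLE_ENFORCE_EQ and the iterator arithmetic express. A standalone sketch of that slicing, with std::string standing in for VarHandle*:
#include <iostream>
#include <string>
#include <vector>
int main() {
  const size_t place_num = 3;                           // P places
  const std::vector<std::string> in_vars{"w0", "w1"};   // N input variables
  // Flattened outputs, variable-major: w0 on every place, then w1 on every place.
  std::vector<std::string> out_vars;
  for (const auto& v : in_vars)
    for (size_t p = 0; p < place_num; ++p)
      out_vars.push_back(v + "@place" + std::to_string(p));
  for (size_t i = 0; i < in_vars.size(); ++i) {
    // Same index arithmetic as the argument passed to BroadcastOneVar in RunImpl.
    std::vector<std::string> slice(out_vars.begin() + i * place_num,
                                   out_vars.begin() + (i + 1) * place_num);
    std::cout << in_vars[i] << " broadcasts to " << slice.size() << " places\n";
  }
  return 0;
}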
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace framework {
namespace details {
struct FusedBroadcastOpHandle : public BroadcastOpHandle {
public:
#ifdef PADDLE_WITH_CUDA
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctx)
: BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
#else
FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
const std::vector<platform::Place>& places)
: BroadcastOpHandle(node, local_scopes, places) {}
#endif
std::string Name() const override;
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace paddle {
namespace framework {
namespace details {
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::string> out_varnames_;
void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
// initialize scope and var
for (size_t i = 0; i < place_list_.size(); ++i) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
*local_scopes_.back()
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
local_scope.Var("out_var" + j);
if (i == j) local_scope.Var("in_var" + j);
}
param_scopes_.emplace_back(&local_scope);
}
// create op handle node
std::unique_ptr<ir::Node> n =
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new FusedBroadcastOpHandle(
n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not supported.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(new FusedBroadcastOpHandle(
n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(
new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
#endif
}
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle
std::unique_ptr<ir::Node> in_node =
ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
VarHandle* in_var_handle =
new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) {
std::unique_ptr<ir::Node> out_node =
ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
VarHandle* out_var_handle =
new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
}
}
void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
std::vector<std::vector<float>> send_vec;
f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
float val_scalar = static_cast<float>(i);
send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
}
op_handle_->Run(false);
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
}
}
}
void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
std::vector<std::vector<float>> send_vector;
std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
float val_scalar = static_cast<float>(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar));
}
op_handle_->Run(false);
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height);
}
}
}
};
TEST(FusedBroadcastTester, CPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
TEST(FusedBroadcastTester, CPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(false);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#ifdef PADDLE_WITH_CUDA
TEST(FusedBroadcastTester, GPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
}
TEST(FusedBroadcastTester, GPUSelectedRows) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
test_op.InitCtxOnGpu(true);
test_op.InitFusedBroadcastOp(input_scope_idxes);
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() { ...@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one."); "The number of output should be one.");
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
...@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() { ...@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor *out_tensor = out_value->mutable_value(); Tensor *out_tensor = out_value->mutable_value();
// copy // copy
auto dev_ctx = dev_ctxes_[out_var_handle->place_]; auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx, RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
t_out_p] { t_out_p] {
int s = 0, e = 0; int s = 0, e = 0;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h" #include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h"
...@@ -347,7 +348,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -347,7 +348,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy::GradientScaleStrategy::kCustomized) { BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle? // TODO(paddle-dev): Why is there no input for this op_handle?
auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name); CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
} }
// This assumes the backward generating code will ensure IsScaleLossOp // This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss. // is true only for the op that scale the final scalar loss.
...@@ -436,10 +437,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -436,10 +437,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
if ((use_gpu && if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) { is_dist_train) {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { if (strategy_.fuse_broadcast_op_) {
auto &to_bcast_set = bcast_var_name_set[dev_id]; CreateFusedBroadcastOp(&result, bcast_var_name_set);
for (auto &bcast_name : to_bcast_set) { } else {
CreateBroadcastOp(&result, bcast_name, dev_id); for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
}
} }
} }
} }
...@@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, ...@@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
} }
} }
void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
#ifdef PADDLE_WITH_CUDA
auto *op_handle = new FusedBroadcastOpHandle(
result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_);
#else
auto *op_handle = new FusedBroadcastOpHandle(
result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
local_scopes_, places_);
#endif
result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
}
for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
for (auto &p_name : bcast_varnames[dev_id]) {
auto *in =
result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back().get();
op_handle->AddInput(in);
for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
auto &p = places_[out_dev_id];
auto &vars =
result->Get<GraphVars>(kGraphVars).at(out_dev_id).at(p_name);
auto *out_var = new VarHandle(
result->CreateEmptyNode(p_name, ir::Node::Type::kVariable),
vars.size(), out_dev_id, p_name, p);
vars.emplace_back(out_var);
op_handle->AddOutput(out_var);
}
}
}
}
void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
ir::Node *node, ir::Node *node,
int dev_id) const { int dev_id) const {
...@@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, ...@@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
} }
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name) const { ir::Graph *result, const std::string &loss_grad_name,
ir::Node *out_var_node) const {
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle // Insert ScaleCost OpHandle
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
...@@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ...@@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
// loss->pending_ops_.emplace_back(op_handle); // loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss); // op_handle->inputs_.emplace_back(loss);
CreateOpOutput( CreateOpOutput(result, op_handle,
result, op_handle, result->CreateVarNode(out_var_node->Var()), places_[i], i);
result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
places_[i], i);
} }
} }
...@@ -680,7 +722,8 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ...@@ -680,7 +722,8 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
} }
if (node->Op()->Type() == "split_byref" || if (node->Op()->Type() == "split_byref" ||
node->Op()->Type() == "split_selected_rows") { node->Op()->Type() == "split_selected_rows" ||
node->Op()->Type() == "split_ids") {
// TODO(paddle-dev): getting the first var is not safe. // TODO(paddle-dev): getting the first var is not safe.
op_dev_id = GetVarDeviceID(*result, input_var_names[0]); op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
......
...@@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name) const; const std::string &loss_grad_name,
ir::Node *out_var_node) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
...@@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
void CreateFusedBroadcastOp(
ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
bool IsSparseGradient(const std::string &og) const; bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID( size_t GetAppropriateDeviceID(
......
...@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { ...@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) { for (auto *in : inputs_) {
if (NeedWait(in)) { if (NeedWait(in)) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
} }
} }
} }
......
...@@ -27,7 +27,7 @@ namespace framework { ...@@ -27,7 +27,7 @@ namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
......
...@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase { ...@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_(nccl_ctxs) { nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) { if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) { for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
p_ctx.second.ctx_.get());
} }
} }
} }
......
...@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() { ...@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue; continue;
} }
if (in->GeneratedOp()) { if (in->GeneratedOp()) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
} }
} }
auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
...@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, ...@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_(static_cast<float>(1.0 / num_dev)), coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope), scope_(scope),
place_(place) { place_(place) {
dev_ctxes_[place_] = dev_ctx; this->SetDeviceContext(place_, dev_ctx);
} }
ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
...@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() { ...@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
auto stream = auto stream = static_cast<platform::CUDADeviceContext *>(
static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_]) this->dev_ctxes_.at(place_))
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(10) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
......
...@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars; auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
std::unordered_set<OpHandleBase *> ready_ops; std::unordered_set<OpHandleBase *> ready_ops;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule // streams from multiple GPUs, it's faster to buffer them and schedule
...@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) { for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
for (auto &name_pair : var_map) { for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) { for (auto &version_pair : name_pair.second) {
InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
} }
} }
} }
for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) { for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
InsertPendingVar(&pending_vars, &ready_vars, var.get()); InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
} }
for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) { for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
...@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
FeedFetchList fetch_data(fetch_tensors.size()); FeedFetchList fetch_data(fetch_tensors.size());
InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
&pending_vars, &ready_vars, &fetch_data); &pending_vars, ready_vars.get(), &fetch_data);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) { auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) { for (auto *op : set) {
running_ops_++; running_ops_++;
RunOp(&ready_vars, op); RunOp(ready_vars, op);
} }
set.clear(); set.clear();
}; };
...@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
run_op_futures_.clear(); run_op_futures_.clear();
exception_holder_.Clear(); exception_holder_.Clear();
event.reset(nullptr); event.reset(nullptr);
// Step 3. Execution // Step 3. Execution
while (!pending_vars.empty()) { while (!pending_vars.empty()) {
// 1. Run All Ready ops // 1. Run All Ready ops
...@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// 2. Find ready variable // 2. Find ready variable
bool timeout; bool timeout;
auto cur_ready_vars = ready_vars.PopAll(1, &timeout); auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
if (timeout) { if (timeout) {
if (exception_holder_.IsCaught()) { if (exception_holder_.IsCaught()) {
...@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
} }
} }
PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(ready_ops.empty());
// Wait FetchOps. // Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_.get(), &fetch_ops);
...@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ...@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
} }
void ThreadedSSAGraphExecutor::RunOp( void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) { const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] { auto op_run = [ready_var_q, op, this] {
try { try {
if (VLOG_IS_ON(10)) { if (VLOG_IS_ON(10)) {
......
...@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~ThreadedSSAGraphExecutor() {} ~ThreadedSSAGraphExecutor() {}
private: private:
void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q, void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
details::OpHandleBase *op); details::OpHandleBase *op);
private: private:
......
...@@ -35,6 +35,7 @@ enum AttrType { ...@@ -35,6 +35,7 @@ enum AttrType {
BLOCK = 8; BLOCK = 8;
LONG = 9; LONG = 9;
BLOCKS = 10; BLOCKS = 10;
LONGS = 11;
} }
// OpDesc describes an instance of a C++ framework::OperatorBase // OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -55,6 +56,7 @@ message OpDesc { ...@@ -55,6 +56,7 @@ message OpDesc {
optional int32 block_idx = 12; optional int32 block_idx = 12;
optional int64 l = 13; optional int64 l = 13;
repeated int32 blocks_idx = 14; repeated int32 blocks_idx = 14;
repeated int64 longs = 15;
}; };
message Var { message Var {
......
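The new LONGS attribute kind lets an operator attribute carry a list of 64-bit integers in addition to the existing scalar LONG. A minimal sketch of what such a tagged value amounts to, using plain std::variant purely for illustration (this is not Paddle's actual Attribute type):
#include <cstdint>
#include <iostream>
#include <variant>
#include <vector>
// Illustrative attribute value: either a single int64 (LONG) or a list (LONGS).
using AttrValue = std::variant<int64_t, std::vector<int64_t>>;
int main() {
  AttrValue l = int64_t{42};                     // corresponds to LONG = 9
  AttrValue ls = std::vector<int64_t>{1, 2, 3};  // corresponds to LONGS = 11
  std::cout << "long: " << std::get<int64_t>(l) << "\n";
  std::cout << "longs size: " << std::get<std::vector<int64_t>>(ls).size() << "\n";
  return 0;
}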
...@@ -36,10 +36,12 @@ pass_library(fc_lstm_fuse_pass inference) ...@@ -36,10 +36,12 @@ pass_library(fc_lstm_fuse_pass inference)
pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(embedding_fc_lstm_fuse_pass inference)
pass_library(fc_gru_fuse_pass inference) pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference) pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference)
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_bias_mkldnn_fuse_pass inference)
pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
...@@ -58,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph ...@@ -58,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif () endif ()
...@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
virtual ~ConvReLUFusePass() {} virtual ~ConvReLUFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, ...@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetInput("X", inputs); op->SetInput("X", inputs);
} }
op->SetOutput("Out", outputs); op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
} }
// a->OP0->b // a->OP0->b
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_NODE(id, pattern) \
PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(pattern.RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
pattern->NewNode("depthwise_conv")
->assert_is_op("depthwise_conv2d")
->assert_op_attr("use_mkldnn", true);
int found_depthwise_conv_mkldnn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
GET_NODE(depthwise_conv, (*pattern));
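    // Only the op type is rewritten to "conv2d"; inputs, outputs and attributes
    // (including use_mkldnn) are left untouched, so the regular MKL-DNN conv
    // kernel picks the op up.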
depthwise_conv->Op()->SetType("conv2d");
found_depthwise_conv_mkldnn_count++;
};
gpd(graph.get(), handler);
AddStatis(found_depthwise_conv_mkldnn_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(depthwise_conv_mkldnn_pass,
paddle::framework::ir::DepthwiseConvMKLDNNPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class DepthwiseConvMKLDNNPass : public FusePassBase {
public:
virtual ~DepthwiseConvMKLDNNPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
op->SetOutput("Out", outputs);
}
// (a, weights, bias)->depthwise conv mkldnn->b
// (b, weights2, bias2)->depthwise conv no mkldnn->c
// (c, weights3, bias3)->conv mkldnn->d
// (d, weights3, bias3)->conv no mkldnn->e
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>(
{"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
"weights3", "bias3", "weights4", "bias4"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
var->SetPersistable(true);
}
}
// depthwise conv with MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv1",
std::vector<std::string>({"a", "weights", "bias"}),
std::vector<std::string>({"b"}), true);
// depthwise conv without MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv2",
std::vector<std::string>({"b", "weights2", "bias2"}),
std::vector<std::string>({"c"}), false);
// conv with MKL-DNN
SetOp(&prog, "conv2d", "conv3",
std::vector<std::string>({"c", "weights3", "bias3"}),
std::vector<std::string>({"d"}), true);
  // conv without MKL-DNN
SetOp(&prog, "conv2d", "conv4",
std::vector<std::string>({"d", "weights4", "bias4"}),
std::vector<std::string>({"e"}), false);
return prog;
}
TEST(DepthwiseConvMKLDNNPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
struct counters {
int mkldnn_depthwise_conv_nodes;
int other_depthwise_conv_nodes;
int mkldnn_conv_nodes;
int other_conv_nodes;
};
counters before{1, 1, 1, 1};
graph = pass->Apply(std::move(graph));
// initialize counters before loop
counters after{0, 0, 0, 0};
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_conv_nodes++;
else
after.other_conv_nodes++;
} else if (op->Type() == "depthwise_conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_depthwise_conv_nodes++;
else
after.other_depthwise_conv_nodes++;
}
}
}
EXPECT_EQ(after.other_depthwise_conv_nodes,
before.other_depthwise_conv_nodes);
EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
before.mkldnn_depthwise_conv_nodes - 1);
EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(depthwise_conv_mkldnn_pass);
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, ...@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
op->SetInput("X", inputs); op->SetInput("X", inputs);
} }
op->SetOutput("Out", outputs); op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
} }
// a->OP0->b // a->OP0->b
......
...@@ -23,80 +23,78 @@ limitations under the License. */ ...@@ -23,80 +23,78 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace {
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) { void CheckProgram(const ProgramDesc &program) {
std::vector<std::string> send_vars; std::map<int, bool> visit;
// since parameters are all in block 0, #define _INT(role) static_cast<int>(role)
// it's enough to only scan send ops in block 0
for (auto &node : nodes) { for (size_t i = 0; i < program.Size(); ++i) {
auto op_vars = node->Op()->InputArgumentNames(); for (OpDesc *op : program.Block(i).AllOps()) {
send_vars.reserve(send_vars.size() + // For backward compatibility, some program doesn't have role added.
std::distance(op_vars.begin(), op_vars.end())); if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); int role_id = boost::get<int>(
} op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
return send_vars; visit[role_id] = true;
} switch (role_id) {
case _INT(OpRole::kForward):
std::vector<std::string> FindDistTrainRecvVars( PADDLE_ENFORCE(
const std::vector<ir::Node *> &nodes) { visit.find(_INT(OpRole::kBackward)) == visit.end(),
std::vector<std::string> recv_vars; "Cannot add forward operator before backward operator.");
for (auto &node : nodes) { break;
auto op_vars = node->Op()->OutputArgumentNames(); case _INT(OpRole::kBackward):
recv_vars.reserve(recv_vars.size() + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
std::distance(op_vars.begin(), op_vars.end())); PADDLE_ENFORCE(
recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); visit.find(_INT(OpRole::kOptimize)) == visit.end(),
} "Cannot add backward operator before optimize operator.");
return recv_vars; break;
} case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars, _INT(OpRole::kLoss)) == visit.end(),
const std::vector<std::string> &recv_vars) { "Cannot add backward|loss operator before "
if (send_vars.size() == 0 || recv_vars.size() == 0) { "forward|loss operator.");
return false; PADDLE_ENFORCE(
} visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator before optimize operator.");
/** break;
* Check any of opvars contains `.block` and in sendvars case _INT(OpRole::kOptimize):
*/ case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
auto checker = [](const std::vector<std::string> &opvars, PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
const std::vector<std::string> &rpc_vars) -> bool { "Optimize operators must follow backward operator.");
for (auto &var : opvars) { break;
// a variable name with the suffix `.block` means it's a splited case _INT(OpRole::kLRSched):
// variable by (DistributeTranspiler) case _INT(OpRole::kDist):
// [python/paddle/fluid/transpiler/distribute_transpiler.py] case _INT(OpRole::kRPC):
if (var.find(".block") != std::string::npos && case _INT(OpRole::kNotSpecified):
std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { break;
return true; default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
} }
} }
return false;
};
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
} }
for (ir::Node *output : node->outputs) { #undef _INT
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
} }
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) { Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
// Make the nodes id start from 0. // Make the nodes id start from 0.
Node::ResetId(); Node::ResetId();
auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes);
}
std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
const ProgramDesc &program) {
VLOG(3) << "block in program:" << program_.Size(); VLOG(3) << "block in program:" << program_.Size();
std::unordered_map<std::string, VarDesc *> all_vars; std::unordered_map<std::string, VarDesc *> all_vars;
// var nodes for each var name, will have multiple versions in SSA
std::map<std::string, std::vector<ir::Node *>> var_nodes;
for (auto *var : program.Block(0).AllVars()) { for (auto *var : program.Block(0).AllVars()) {
all_vars.emplace(var->Name(), var); all_vars.emplace(var->Name(), var);
} }
std::map<std::string, std::vector<ir::Node *>> var_nodes;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
ir::Node *node = CreateOpNode(op); ir::Node *node = CreateOpNode(op);
// For input args, reuse the same var name if it was created before. // For input args, reuse the same var name if it was created before.
...@@ -134,7 +132,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { ...@@ -134,7 +132,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
var->inputs.push_back(node); var->inputs.push_back(node);
} }
} }
return std::move(var_nodes);
}
void Graph::ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes) {
/** /**
* We should handle write after read(WAR) and write after write(WAW) here. * We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly. * Because some of the operators of the program can be executed parallelly.
...@@ -153,6 +155,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { ...@@ -153,6 +155,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
auto it_old = versions.rbegin(); auto it_old = versions.rbegin();
++it_old; ++it_old;
for (; it_old != versions.rend(); it_new = it_old, ++it_old) { for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
VLOG(3) << "deal with var: " << (*it_new)->Name();
ir::Node *write_op = ir::Node *write_op =
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs; const auto &read_ops = (*it_old)->outputs;
......
...@@ -160,6 +160,12 @@ class Graph { ...@@ -160,6 +160,12 @@ class Graph {
return nullptr; return nullptr;
} }
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private: private:
// This method takes ownership of `node`. // This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) { ir::Node *AddNode(ir::Node *node) {
......
...@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) { ...@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
std::deque<ir::Node *> q_nodes; std::deque<ir::Node *> q_nodes;
std::vector<std::unordered_set<ir::Node *>> graph_nodes; std::vector<std::unordered_set<ir::Node *>> graph_nodes;
std::unordered_set<ir::Node *> g_nodes; std::unordered_set<ir::Node *> g_nodes;
// q_set used to record records in the queue.
std::unordered_set<ir::Node *> q_set;
size_t graph_count = 0; size_t graph_count = 0;
auto traverse_nodes = [&visited_nodes, auto traverse_nodes = [&visited_nodes, &q_nodes,
&q_nodes](const std::vector<ir::Node *> &nodes) { &q_set](const std::vector<ir::Node *> &nodes) {
std::copy_if( for (auto n : nodes) {
nodes.begin(), nodes.end(), std::back_inserter(q_nodes), if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
[&visited_nodes](Node *node) { return !visited_nodes.count(node); }); q_nodes.push_back(n);
q_set.insert(n);
}
}
}; };
while (visited_nodes.size() != nodes.size()) { while (visited_nodes.size() != nodes.size()) {
if (!q_nodes.empty()) { if (!q_nodes.empty()) {
auto cur_node = q_nodes.front(); auto cur_node = q_nodes.front();
q_nodes.pop_front(); q_nodes.pop_front();
q_set.erase(cur_node);
visited_nodes.insert(cur_node); visited_nodes.insert(cur_node);
g_nodes.insert(cur_node); g_nodes.insert(cur_node);
traverse_nodes(cur_node->inputs); traverse_nodes(cur_node->inputs);
...@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) { ...@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
for (auto &n : nodes) { for (auto &n : nodes) {
if (visited_nodes.count(n) == 0) { if (visited_nodes.count(n) == 0) {
q_nodes.push_back(n); q_nodes.push_back(n);
q_set.insert(n);
break; break;
} }
} }
......
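The added q_set mirrors the queue contents so that a node is enqueued at most once; the previous copy_if filtered only on visited_nodes, so a node reachable from several already-visited neighbours could be pushed into q_nodes repeatedly before ever being popped. A standalone sketch of the guarded enqueue, with plain ints in place of ir::Node*:
#include <deque>
#include <iostream>
#include <unordered_set>
#include <vector>
int main() {
  std::unordered_set<int> visited, queued;
  std::deque<int> q;
  auto enqueue = [&](const std::vector<int>& neighbours) {
    for (int n : neighbours) {
      // Skip nodes already processed *or* already waiting in the queue.
      if (visited.count(n) == 0 && queued.count(n) == 0) {
        q.push_back(n);
        queued.insert(n);
      }
    }
  };
  enqueue({1, 2});
  enqueue({2, 3});  // 2 is not duplicated thanks to the queued set
  while (!q.empty()) {
    int cur = q.front();
    q.pop_front();
    queued.erase(cur);
    visited.insert(cur);
    std::cout << "visit " << cur << "\n";
  }
  return 0;
}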
...@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() { ...@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
return result; return result;
} }
bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
const std::pair<PDNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
// TODO(Superjomn) enhance the function as it marks unique unique as duplicates // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550 // see https://github.com/PaddlePaddle/Paddle/issues/13550
void GraphPatternDetector::UniquePatterns( void GraphPatternDetector::UniquePatterns(
...@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns( ...@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetector::subgraph_t> result; std::vector<GraphPatternDetector::subgraph_t> result;
std::unordered_set<size_t> set; std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) { for (auto &g : *subgraphs) {
size_t key = 0; // Sort the items in the sub-graph, and transform to a string key.
for (auto &item : g) { std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
key ^= std::hash<void *>{}(item.first); std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
key ^= std::hash<void *>{}(item.second); std::stringstream ss;
for (auto &item : sorted_keys) {
ss << item.first << ":" << item.second;
} }
auto key = hasher(ss.str());
if (!set.count(key)) { if (!set.count(key)) {
result.emplace_back(g); result.emplace_back(g);
set.insert(key); set.insert(key);
......
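The old key XOR-ed the hashes of every matched pattern-node and graph-node separately, so two genuinely different matches built from the same set of pointers collapse to the same key and one of them is wrongly dropped as a duplicate; hashing a canonical string built from the sorted pairs keeps the pairing information. A standalone illustration with strings in place of node pointers:
#include <functional>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
using Subgraph = std::map<std::string, std::string>;  // pattern node -> matched node
// Old scheme: XOR of the individual element hashes; pairing information is lost.
size_t XorKey(const Subgraph& g) {
  size_t key = 0;
  for (const auto& item : g) {
    key ^= std::hash<std::string>{}(item.first);
    key ^= std::hash<std::string>{}(item.second);
  }
  return key;
}
// New scheme: hash a canonical string built from the (already sorted) pairs.
size_t SortedStringKey(const Subgraph& g) {
  std::stringstream ss;
  for (const auto& item : g) ss << item.first << ":" << item.second << ";";
  return std::hash<std::string>{}(ss.str());
}
int main() {
  Subgraph a{{"A", "x"}, {"B", "y"}};
  Subgraph b{{"A", "y"}, {"B", "x"}};  // a different match over the same elements
  std::cout << "xor keys equal:    " << (XorKey(a) == XorKey(b)) << "\n";  // 1
  std::cout << "sorted keys equal: " << (SortedStringKey(a) == SortedStringKey(b))
            << "\n";  // 0 with overwhelming probability
  return 0;
}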
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
static const char kNumRepeats[] = "num_repeats";
typedef std::unordered_map<std::string, std::vector<ir::Node*>> SSAVarList;
ir::Node* SameNameVar(std::unordered_set<ir::Node*> all, ir::Node* target) {
for (auto n : all) {
if (target->IsVar() && target->Name() == n->Name()) {
return n;
}
}
return nullptr;
}
VarDesc CopyVarDesc(VarDesc* var_desc) {
VarDesc repeated_var(var_desc->Name());
// copy other variable attributes
if (var_desc->GetType() != proto::VarType::READER) {
repeated_var.SetType(var_desc->GetType());
repeated_var.SetShape(var_desc->GetShape());
repeated_var.SetDataType(var_desc->GetDataType());
repeated_var.SetLoDLevel(var_desc->GetLoDLevel());
repeated_var.SetPersistable(var_desc->Persistable());
} else {
// TODO(typhoonzero): copy reader var
}
return repeated_var;
}
VarDesc UpdateGradVarDesc(
VarDesc* var_desc, int repeat,
const std::unordered_set<std::string>& grad_names,
const std::unordered_set<std::string>& bn_vars_need_rename) {
if (grad_names.find(var_desc->Name()) != grad_names.end() ||
bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) {
std::string new_gname =
string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
VarDesc repeated_var = CopyVarDesc(var_desc);
repeated_var.SetName(new_gname);
VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
return repeated_var;
}
return *var_desc;
}
std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std::unique_ptr<Graph> graph) const {
int num_repeats = Get<const int>(kNumRepeats);
std::vector<Node*> forward_backward_ops;
std::vector<Node*> optimize_ops;
std::vector<Node*> lr_ops; // ops other than forward/backward/optimize
std::unordered_set<std::string> grad_names;
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
auto origin_nodes = graph->ReleaseNodes();
VLOG(3) << "origin nodes count: " << origin_nodes.size();
ir::Graph& result = *graph;
// 1. record op nodes of different roles
for (auto node : nodes) {
if (node->IsVar()) continue;
int op_role = boost::get<int>(node->Op()->GetAttr(
framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
(op_role & static_cast<int>(framework::OpRole::kBackward)) ||
(op_role & static_cast<int>(framework::OpRole::kLoss))) {
forward_backward_ops.push_back(node);
} else if ((op_role & static_cast<int>(framework::OpRole::kOptimize)) ||
(op_role & static_cast<int>(framework::OpRole::kDist)) ||
(op_role & static_cast<int>(framework::OpRole::kRPC))) {
optimize_ops.push_back(node);
auto op_role_var = node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName());
auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
for (size_t i = 0; i < op_role_vars.size(); i += 2) {
grad_names.insert(op_role_vars[i + 1]);
}
} else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
lr_ops.push_back(node);
} else { // NOLINT
PADDLE_THROW("Invalid op_role: %d", static_cast<int>(op_role));
}
}
// 2. copy forward backward
ir::Node* prev_repeat_last_op_node = nullptr;
// record origin_grad -> repeated grad list map.
std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
std::map<std::string, std::vector<ir::Node*>> created;
std::unordered_set<std::string> bn_vars_need_rename;
for (int i = 0; i < num_repeats; ++i) {
std::unordered_set<ir::Node*> copied;
for (size_t node_idx = 0; node_idx < forward_backward_ops.size();
++node_idx) {
auto node = forward_backward_ops[node_idx];
OpDesc repeated_op(*(node->Op()), node->Op()->Block());
// 3. rename grad outputs to current repeat.
for (auto outname : repeated_op.OutputArgumentNames()) {
if (grad_names.find(outname) != grad_names.end()) {
std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
repeated_op.RenameOutput(outname, new_gname);
}
}
// 3.5 let batch_norm ops use independent vars; note that batch_norm_grad does
// not need this update
if (node->Name() == "batch_norm") {
// NOTE: assume bn ops created by layers use the same var as output mean and
// variance
std::string new_mean_name =
string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i);
std::string new_var_name = string::Sprintf(
"%s.repeat.%d", repeated_op.Input("Variance")[0], i);
bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
<< new_mean_name;
repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
new_mean_name);
repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0],
new_var_name);
}
// 3.9 do copy
auto repeated_node = result.CreateOpNode(&repeated_op);
copied.insert(node);
// 4. add deps between repeats
if (node_idx == forward_backward_ops.size() - 1) {
prev_repeat_last_op_node = repeated_node;
}
if (node_idx == 0 && prev_repeat_last_op_node) {
auto* depvar = result.CreateControlDepVar();
prev_repeat_last_op_node->outputs.push_back(depvar);
depvar->inputs.push_back(prev_repeat_last_op_node);
repeated_node->inputs.push_back(depvar);
depvar->outputs.push_back(repeated_node);
}
for (auto in_node : node->inputs) {
if (in_node->IsCtrlVar()) {
continue;
}
ir::Node* var = nullptr;
auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names,
bn_vars_need_rename);
// should be initialized by the startup program; how to initialize a tensor
// in the scope?
if (node->Name() == "batch_norm" &&
bn_vars_need_rename.find(in_node->Name()) !=
bn_vars_need_rename.end()) {
// Create bn mean/variance for each repeat
var = result.CreateVarNode(&updated_var);
created[updated_var.Name()].push_back(var);
copied.insert(in_node);
repeated_node->inputs.push_back(var);
var->outputs.push_back(repeated_node);
continue;
}
// for other ops
if (in_node->inputs.empty() && i > 0) {
// do not copy head vars (inputs, params) in repeats > 0
var = created.at(in_node->Name()).back();
} else {
if (copied.find(in_node) == copied.end()) {
var = result.CreateVarNode(&updated_var);
if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) {
grad_repeated_map[in_node].push_back(var);
}
copied.insert(in_node);
created[updated_var.Name()].push_back(var);
} else {
var = created.at(updated_var.Name()).back();
}
}
repeated_node->inputs.push_back(var);
var->outputs.push_back(repeated_node);
}
for (auto out_node : node->outputs) {
if (out_node->IsCtrlVar()) {
continue;
}
ir::Node* var = nullptr;
auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names,
bn_vars_need_rename);
if (copied.find(out_node) == copied.end()) {
var = result.CreateVarNode(&updated_var);
if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) {
grad_repeated_map[out_node].push_back(var);
}
copied.insert(out_node);
created[updated_var.Name()].push_back(var);
} else {
var = created.at(updated_var.Name()).back();
}
repeated_node->outputs.push_back(var);
var->inputs.push_back(repeated_node);
}
}
}
// 5. create GRAD merge op node
for (auto kv : grad_repeated_map) {
OpDesc sum_op;
sum_op.SetType("sum");
std::vector<std::string> repeated_grad_names;
for (auto r : kv.second) {
repeated_grad_names.push_back(r->Var()->Name());
}
sum_op.SetInput("X", repeated_grad_names);
sum_op.SetOutput("Out", {kv.first->Var()->Name()});
sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward));
auto sum_op_node = result.CreateOpNode(&sum_op);
for (auto r : kv.second) {
sum_op_node->inputs.push_back(r);
r->outputs.push_back(sum_op_node);
}
auto sum_out_var_node = result.CreateVarNode(kv.first->Var());
sum_op_node->outputs.push_back(sum_out_var_node);
sum_out_var_node->inputs.push_back(sum_op_node);
created[sum_out_var_node->Name()].push_back(sum_out_var_node);
OpDesc scale_op;
scale_op.SetType("scale");
scale_op.SetInput("X", {sum_out_var_node->Var()->Name()});
// NOTE: inplace scale.
scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()});
scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kBackward));
auto scale_op_node = result.CreateOpNode(&scale_op);
scale_op_node->inputs.push_back(sum_out_var_node);
sum_out_var_node->outputs.push_back(scale_op_node);
auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var());
scale_op_node->outputs.push_back(scale_out_var_node);
scale_out_var_node->inputs.push_back(scale_op_node);
created[scale_out_var_node->Name()].push_back(scale_out_var_node);
}
// 6. add optimize ops
{
auto copy_node = [&result, &created](ir::Node* node) {
auto op_node = result.CreateOpNode(node->Op());
// copy op ins/outs
// NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe
// dependencies, so create those depvars if the OpDesc has in/outs.
for (auto in_node : node->inputs) {
if (in_node->IsCtrlVar() && !in_node->Var()) {
continue;
}
ir::Node* var = nullptr;
if (created.find(in_node->Name()) == created.end()) {
var = result.CreateVarNode(in_node->Var());
created[in_node->Name()].push_back(var);
} else {
var = created.at(in_node->Name()).back();
}
op_node->inputs.push_back(var);
var->outputs.push_back(op_node);
}
for (auto out_node : node->outputs) {
if (out_node->IsCtrlVar() && !out_node->Var()) {
continue;
}
auto var = result.CreateVarNode(out_node->Var());
created[out_node->Name()].push_back(var);
op_node->outputs.push_back(var);
var->inputs.push_back(op_node);
}
};
for (auto node : lr_ops) {
copy_node(node);
}
for (auto node : optimize_ops) {
copy_node(node);
}
}
result.ResolveHazard(created);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass)
.RequirePassAttr(paddle::framework::ir::kNumRepeats);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
// BatchMergePass copies the forward and backward ops several times so that
// several batches can be run per iteration, simulating large batch size
// training as if more than 1 GPU were available.
// Users define how many batches to run; the gradients are merged across
// those repeats, and optimization is then done with the merged gradients.
// This pass is extremely useful for large batch-size distributed sync
// training, since it simulates an even larger batch size as if more GPUs
// were available.
class BatchMergePass : public Pass {
public:
virtual ~BatchMergePass() {}
protected:
std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
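As a numeric illustration of the gradient-merge step described above (step 5 in the .cc file: a sum op followed by a scale op with scale = 1.0 / num_repeats), merging the repeated gradients amounts to averaging them. A minimal sketch, not Paddle code, assuming every repeat produced a gradient of the same size:

#include <vector>

// Sum the gradients from each repeat, then scale by 1/num_repeats.
std::vector<float> MergeRepeatedGrads(
    const std::vector<std::vector<float>>& repeated_grads) {
  std::vector<float> merged(repeated_grads.front().size(), 0.f);
  for (const auto& grad : repeated_grads) {
    for (size_t i = 0; i < grad.size(); ++i) merged[i] += grad[i];
  }
  const float scale = 1.f / static_cast<float>(repeated_grads.size());
  for (auto& v : merged) v *= scale;
  return merged;
}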
...@@ -44,6 +44,7 @@ class Node { ...@@ -44,6 +44,7 @@ class Node {
return op_desc_.get(); return op_desc_.get();
} }
// Please don't use this API!
int id() const { return id_; } int id() const { return id_; }
bool IsOp() const { return type_ == Type::kOperation; } bool IsOp() const { return type_ == Type::kOperation; }
...@@ -92,6 +93,7 @@ class Node { ...@@ -92,6 +93,7 @@ class Node {
Node() = delete; Node() = delete;
static int count_; static int count_;
// Please don't use this API or make this public.
static void ResetId() { count_ = 0; } static void ResetId() { count_ = 0; }
DISABLE_COPY_AND_ASSIGN(Node); DISABLE_COPY_AND_ASSIGN(Node);
}; };
......
...@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor( ...@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
for (size_t j = 0; j < lod.size(); ++j) { for (size_t j = 0; j < lod.size(); ++j) {
auto &sub_lod = new_lod[j]; auto &sub_lod = new_lod[j];
auto &offset = sub_lod.back(); size_t offset = sub_lod.back();
for (size_t k = 1; k < lod[j].size(); ++k) { for (size_t k = 1; k < lod[j].size(); ++k) {
sub_lod.push_back(lod[j][k] + offset); sub_lod.push_back(lod[j][k] + offset);
} }
......
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
using LoDTensorArray = std::vector<LoDTensor>; using LoDTensorArray = std::vector<LoDTensor>;
}
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> { ...@@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
this->reserve(this->size() + size_t(end - begin)); this->reserve(this->size() + size_t(end - begin));
this->insert(this->end(), begin, end); this->insert(this->end(), begin, end);
} }
const T *CUDAData(platform::Place place) const {
PADDLE_THROW(
"Vector::CUDAData() method is not supported in CPU-only version");
}
T *CUDAMutableData(platform::Place place) {
PADDLE_THROW(
"Vector::CUDAMutableData() method is not supported in CPU-only "
"version");
}
const T *Data(platform::Place place) const {
PADDLE_ENFORCE(
platform::is_cpu_place(place),
"Vector::Data() method is not supported when not in CPUPlace");
return this->data();
}
T *MutableData(platform::Place place) {
PADDLE_ENFORCE(
platform::is_cpu_place(place),
"Vector::MutableData() method is not supported when not in CPUPlace");
return this->data();
}
const void *Handle() const { return static_cast<const void *>(this); }
}; };
template <typename T> template <typename T>
......
...@@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() { ...@@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
ops_.swap(ops); ops_.swap(ops);
} }
void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
#ifdef PADDLE_WITH_MKLDNN
VLOG(3) << "use_mkldnn=True";
for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
for (auto *op : block->AllOps()) {
if (op->HasAttr("use_mkldnn")) {
op->SetAttr("use_mkldnn", true);
}
}
}
#else
LOG(WARNING)
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -48,8 +48,6 @@ class NaiveExecutor { ...@@ -48,8 +48,6 @@ class NaiveExecutor {
void CleanFeedFetchOps(); void CleanFeedFetchOps();
void EnableMKLDNN(const ProgramDesc& program);
protected: protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
......
...@@ -419,8 +419,15 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -419,8 +419,15 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
} }
VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
} }
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
void operator()(int64_t v) const { attr_->set_l(v); } void operator()(int64_t v) const { attr_->set_l(v); }
void operator()(const std::vector<int64_t> &v) const {
VectorToRepeated(v, attr_->mutable_longs());
}
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
}; };
......
...@@ -121,10 +121,6 @@ class OpDesc { ...@@ -121,10 +121,6 @@ class OpDesc {
BlockDesc *Block() { return this->block_; } BlockDesc *Block() { return this->block_; }
const BlockDesc &BlockRef() const { return *this->block_; }
void SetBlock(BlockDesc *block) { this->block_ = block; }
private: private:
template <typename MapType> template <typename MapType>
static std::vector<typename MapType::key_type> MapKeys(const MapType &map) { static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
......
...@@ -28,12 +28,12 @@ enum class OpRole { ...@@ -28,12 +28,12 @@ enum class OpRole {
kBackward = 0x0001, kBackward = 0x0001,
kOptimize = 0x0002, kOptimize = 0x0002,
// RPC role is for send/recv related op // RPC role is for send/recv related op
kRPC = 0x0003, kRPC = 0x0004,
// Dist role is for split_byref/split_selected_rows/concat // Dist role is for split_byref/split_selected_rows/concat
// used for distributed training. // used for distributed training.
kDist = 0x0004, kDist = 0x0008,
// Tag all learning rate scheduler operators. // Tag all learning rate scheduler operators.
kLRSched = 0x0005, kLRSched = 0x0010,
kLoss = 0x0100, kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and // The default value of op's role. This should be only used for unittests and
......
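The renumbering above matters because op roles are combined and tested with bitwise AND (as the batch-merge pass does with op_role & ...). Under the old values, kRPC == 0x0003 shared bits with kBackward and kOptimize, so a bit test could match the wrong role. A small standalone sketch of bit-flag role testing with the new values (kForward is assumed to be 0x0000, since it is not shown in the hunk above):

#include <cstdio>

enum class OpRole {
  kForward = 0x0000,  // assumed; not shown in the hunk above
  kBackward = 0x0001,
  kOptimize = 0x0002,
  kRPC = 0x0004,
  kDist = 0x0008,
  kLRSched = 0x0010,
  kLoss = 0x0100,
};

inline bool HasRole(int attr, OpRole role) {
  return (attr & static_cast<int>(role)) != 0;
}

int main() {
  int attr =
      static_cast<int>(OpRole::kBackward) | static_cast<int>(OpRole::kLoss);
  std::printf("backward? %d rpc? %d\n", HasRole(attr, OpRole::kBackward),
              HasRole(attr, OpRole::kRPC));  // prints: backward? 1 rpc? 0
  // With the old kRPC == 0x0003, the second test would wrongly report 1,
  // because 0x0003 overlaps the kBackward/kOptimize bits.
  return 0;
}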
...@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() { ...@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
} }
} }
static bool VarIsTensor(const Variable* var) { static bool VarIsTensor(const Variable& var) {
return var->IsType<LoDTensor>() || var->IsType<SelectedRows>(); return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
} }
static const Tensor* GetTensorFromVar(Variable* var) { const Tensor* GetTensorFromVar(const Variable& var) {
if (var->IsType<LoDTensor>()) { if (var.IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>(); return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
} else if (var->IsType<SelectedRows>()) { } else if (var.IsType<SelectedRows>()) {
return var->GetMutable<SelectedRows>()->mutable_value(); return &(var.Get<SelectedRows>().value());
} else { } else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name()); var.Type().name());
} }
} }
...@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const { ...@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
template <> template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const { const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
auto* var = InputVar(name); auto* var = InputVar(name);
return var == nullptr ? nullptr return var == nullptr ? nullptr : GetTensorFromVar(*var);
: GetTensorFromVar(const_cast<Variable*>(var));
} }
template <> template <>
...@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>( ...@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
std::transform(names.begin(), names.end(), std::back_inserter(res), std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) { [&](const std::string& sub_name) {
auto var = scope_.FindVar(sub_name); auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : GetTensorFromVar(var); return var == nullptr ? nullptr : GetTensorFromVar(*var);
}); });
return res; return res;
} }
...@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
for (auto& var_name : inplace_vars) { for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor = auto* var = transfer_scope.FindVar(var_name);
GetTensorFromVar(transfer_scope.FindVar(var_name)); PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
var_name);
auto* transformed_tensor = GetTensorFromVar(*var);
original_tensor->ShareDataWith(*transformed_tensor); original_tensor->ShareDataWith(*transformed_tensor);
} }
} }
...@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData( ...@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name); auto* var = scope.FindVar(var_name);
// Only tensor can be tranfer to another device. // Only tensor can be tranfer to another device.
if (var == nullptr || !VarIsTensor(var)) { if (var == nullptr || !VarIsTensor(*var)) {
continue; continue;
} }
auto* tensor_in = GetTensorFromVar(var); auto* tensor_in = GetTensorFromVar(*var);
if (!tensor_in->IsInitialized()) { if (!tensor_in->IsInitialized()) {
continue; continue;
} }
......
...@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) { ...@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
} }
proto::VarType::Type GetDataTypeOfVar(const Variable* var); proto::VarType::Type GetDataTypeOfVar(const Variable* var);
const Tensor* GetTensorFromVar(const Variable& var);
class OperatorBase; class OperatorBase;
class ExecutionContext; class ExecutionContext;
......
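GetTensorFromVar now takes a const Variable& and returns a const Tensor*, resolving either to the LoDTensor itself or to the value of a SelectedRows. A hedged usage sketch under the assumption that the variable already holds a LoDTensor; error handling is omitted:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"

void Example() {
  paddle::framework::Scope scope;
  auto* var = scope.Var("x");                       // create variable "x"
  var->GetMutable<paddle::framework::LoDTensor>();  // let it hold a LoDTensor
  // Read-only access through the new const-reference overload.
  const paddle::framework::Tensor* t =
      paddle::framework::GetTensorFromVar(*var);
  (void)t;
}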
...@@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor( ...@@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor(
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToDevices(bcast_vars); BCastParamsToDevices(bcast_vars);
} }
// Startup Program has been run. All local scopes have correct parameters. // Startup Program has been run. All local scopes have correct parameters.
// Step 2. Create vars in each scope; // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
std::vector<details::VariableInfo> var_infos;
for (auto *var : main_program.Block(0).AllVars()) {
var_infos.emplace_back();
var_infos.back().name_ = var->Name();
var_infos.back().type_ = var->GetType();
var_infos.back().persistable_ = var->Persistable();
}
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
...@@ -156,6 +147,23 @@ ParallelExecutor::ParallelExecutor( ...@@ -156,6 +147,23 @@ ParallelExecutor::ParallelExecutor(
params, member_->local_scopes_, member_->use_cuda_); params, member_->local_scopes_, member_->use_cuda_);
#endif #endif
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector<details::VariableInfo> var_infos;
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back();
var_infos.back().name_ = node->Var()->Name();
var_infos.back().type_ = node->Var()->GetType();
var_infos.back().persistable_ = node->Var()->Persistable();
}
}
// If the loss_var_name is given, the number of graphs should be only one.
if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
"The number of graph should be only one");
}
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph))); exec_strategy, member_->local_scopes_, places, std::move(graph)));
...@@ -179,6 +187,10 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -179,6 +187,10 @@ void ParallelExecutor::BCastParamsToDevices(
} }
auto &main_tensor = main_var->Get<LoDTensor>(); auto &main_tensor = main_var->Get<LoDTensor>();
if (!main_tensor.IsInitialized()) {
VLOG(3) << "one in var not inited, return!";
continue;
}
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -291,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -291,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
} }
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
const auto dev_ctxs = for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().GetAllDeviceContexts(); platform::DeviceContextPool::Instance().Get(p)->Wait();
for (auto &dev_ctx : dev_ctxs) {
dev_ctx->Wait();
} }
if (member_->own_local_scope_) { if (member_->own_local_scope_) {
......
...@@ -78,6 +78,8 @@ class Scope { ...@@ -78,6 +78,8 @@ class Scope {
/// Drop all kids scopes belonged to this scope. /// Drop all kids scopes belonged to this scope.
void DropKids(); void DropKids();
std::list<Scope*>& kids() const { return kids_; }
/// Find if a scope exists in the kid scopes /// Find if a scope exists in the kid scopes
bool HasKid(const Scope* scope) const; bool HasKid(const Scope* scope) const;
......
...@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) { ...@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
platform::CPUPlace()); platform::CPUPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
// Not sure if it's desired, but currently, Tensor type can be changed.
{
framework::Tensor src_tensor;
int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p1, nullptr);
*p1 = 1;
uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
platform::CPUPlace());
EXPECT_NE(p2, nullptr);
EXPECT_EQ(static_cast<int>(p2[0]), 1);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
......
...@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place); auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place); auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cuda_pinned_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
nullptr);
} }
#endif #endif
} }
......
...@@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0, ...@@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr); std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
std::once_flag ThreadPool::init_flag_; std::once_flag ThreadPool::init_flag_;
...@@ -47,8 +46,7 @@ void ThreadPool::Init() { ...@@ -47,8 +46,7 @@ void ThreadPool::Init() {
} }
} }
ThreadPool::ThreadPool(int num_threads) ThreadPool::ThreadPool(int num_threads) : running_(true) {
: total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
threads_.resize(num_threads); threads_.resize(num_threads);
for (auto& thread : threads_) { for (auto& thread : threads_) {
// TODO(Yancey1989): binding the thread on the specify CPU number // TODO(Yancey1989): binding the thread on the specify CPU number
...@@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads) ...@@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
ThreadPool::~ThreadPool() { ThreadPool::~ThreadPool() {
{ {
// notify all threads to stop running // notify all threads to stop running
std::lock_guard<std::mutex> l(mutex_);
running_ = false; running_ = false;
scheduled_.notify_all(); scheduled_.notify_all();
} }
...@@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() { ...@@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
} }
} }
void ThreadPool::Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
void ThreadPool::TaskLoop() { void ThreadPool::TaskLoop() {
while (running_) { while (true) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) { scheduled_.wait(
break; lock, [this] { return !this->tasks_.empty() || !this->running_; });
if (!running_ || tasks_.empty()) {
return;
} }
// pop a task from the task queue // pop a task from the task queue
auto task = std::move(tasks_.front()); auto task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
--idle_threads_;
lock.unlock(); lock.unlock();
// run the task // run the task
task(); task();
{
std::unique_lock<std::mutex> lock(mutex_);
++idle_threads_;
if (Done()) {
completed_.notify_all();
}
}
} }
} }
......
...@@ -57,15 +57,6 @@ class ThreadPool { ...@@ -57,15 +57,6 @@ class ThreadPool {
~ThreadPool(); ~ThreadPool();
// Returns the number of threads created by the constructor.
size_t Threads() const { return total_threads_; }
// Returns the number of currently idle threads.
size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_);
return idle_threads_;
}
// Run pushes a function to the task queue and returns a std::future // Run pushes a function to the task queue and returns a std::future
// object. To wait for the completion of the task, call // object. To wait for the completion of the task, call
// std::future::wait(). // std::future::wait().
...@@ -94,25 +85,13 @@ class ThreadPool { ...@@ -94,25 +85,13 @@ class ThreadPool {
}); });
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future(); std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task)); tasks_.push(std::move(task));
lock.unlock();
scheduled_.notify_one(); scheduled_.notify_one();
return f; return f;
} }
// Wait until all the tasks are completed.
void Wait();
private: private:
DISABLE_COPY_AND_ASSIGN(ThreadPool); DISABLE_COPY_AND_ASSIGN(ThreadPool);
// If the task queue is empty and the number of available threads equals the
// total number of threads, all tasks are completed. Note: this function
// is not thread-safe. Returns true if all tasks are completed.
// Note: don't delete the data member total_threads_ and use
// threads_.size() instead, because you'd need to lock the mutex
// before accessing threads_.
bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
// The constructor starts threads to run TaskLoop, which retrieves // The constructor starts threads to run TaskLoop, which retrieves
// and runs tasks from the queue. // and runs tasks from the queue.
void TaskLoop(); void TaskLoop();
...@@ -125,14 +104,11 @@ class ThreadPool { ...@@ -125,14 +104,11 @@ class ThreadPool {
static std::once_flag init_flag_; static std::once_flag init_flag_;
std::vector<std::unique_ptr<std::thread>> threads_; std::vector<std::unique_ptr<std::thread>> threads_;
const size_t total_threads_;
size_t idle_threads_;
std::queue<Task> tasks_; std::queue<Task> tasks_;
std::mutex mutex_; std::mutex mutex_;
bool running_; bool running_;
std::condition_variable scheduled_; std::condition_variable scheduled_;
std::condition_variable completed_;
}; };
class ThreadPoolIO : ThreadPool { class ThreadPoolIO : ThreadPool {
......
...@@ -19,10 +19,11 @@ limitations under the License. */ ...@@ -19,10 +19,11 @@ limitations under the License. */
namespace framework = paddle::framework; namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) { void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
std::vector<std::future<void>> fs; std::atomic<int>* sum, int cnt) {
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
fs.push_back(framework::Async([sum]() { sum->fetch_add(1); })); std::lock_guard<std::mutex> l(*mu);
fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
} }
} }
...@@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) { ...@@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
} }
TEST(ThreadPool, ConcurrentRun) { TEST(ThreadPool, ConcurrentRun) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0); std::atomic<int> sum(0);
std::vector<std::thread> threads; std::vector<std::thread> threads;
std::vector<std::future<void>> fs;
std::mutex fs_mu;
int n = 50; int n = 50;
// sum = (n * (n + 1)) / 2 // sum = (n * (n + 1)) / 2
for (int i = 1; i <= n; ++i) { for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, &sum, i); std::thread t(do_sum, &fs, &fs_mu, &sum, i);
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
for (auto& t : threads) { for (auto& t : threads) {
t.join(); t.join();
} }
pool->Wait(); for (auto& t : fs) {
t.wait();
}
EXPECT_EQ(sum, ((n + 1) * n) / 2); EXPECT_EQ(sum, ((n + 1) * n) / 2);
} }
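With ThreadPool::Wait() and the idle-thread bookkeeping removed, callers now keep the std::future returned for each submitted task and wait on it, as the updated test does. A minimal standalone sketch of that pattern using std::async as a stand-in for the pool:

#include <future>
#include <vector>

int main() {
  std::vector<std::future<int>> futures;
  for (int i = 0; i < 4; ++i) {
    futures.push_back(std::async(std::launch::async, [i] { return i * i; }));
  }
  int sum = 0;
  for (auto& f : futures) sum += f.get();  // blocks until each task finishes
  return sum == 0 + 1 + 4 + 9 ? 0 : 1;     // exits 0 on success
}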
...@@ -36,7 +36,7 @@ using Attribute = ...@@ -36,7 +36,7 @@ using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>, boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool, std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*, int64_t, std::vector<bool>, BlockDesc*, int64_t,
std::vector<BlockDesc*>>; std::vector<BlockDesc*>, std::vector<int64_t>>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
......
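Adding std::vector<int64_t> to the Attribute variant (together with the new SetAttrDescVisitor branch earlier in this diff that serializes it via mutable_longs) lets an attribute carry a list of 64-bit integers. A reduced standalone sketch of storing and reading such a value from a boost::variant; the real Attribute has more alternatives:

#include <boost/variant.hpp>
#include <cstdint>
#include <string>
#include <vector>

using Attribute =
    boost::variant<int, float, std::string, std::vector<int64_t>>;

int main() {
  Attribute attr = std::vector<int64_t>{int64_t{1} << 40, 2, 3};
  const auto& longs = boost::get<std::vector<int64_t>>(attr);
  return longs.size() == 3 ? 0 : 1;
}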
if(WITH_TESTING)
include(test.cmake) # some generic cmake funtion for inference
endif()
# analysis and tensorrt must be added before creating static library, # analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library. # otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis) add_subdirectory(analysis)
...@@ -30,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -30,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
endif() endif()
# Create static library # Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac. # TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
...@@ -40,7 +43,7 @@ endif() ...@@ -40,7 +43,7 @@ endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api) DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE) if(NOT APPLE)
......
...@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) ...@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
function (inference_analysis_test TARGET) function(inference_analysis_test TARGET)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS ARGS EXTRA_DEPS) set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(mem_opt "") inference_base_test(${TARGET}
if(WITH_GPU) SRCS ${analysis_test_SRCS}
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
endif() ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
cc_test(${TARGET} endif()
SRCS "${analysis_test_SRCS}"
DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
......
...@@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) { ...@@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) {
passes.push_back("mkldnn_placement_pass"); passes.push_back("mkldnn_placement_pass");
} }
#endif #endif
// infer_clean_graph_pass should be the first default pass
// after mkldnn_placement_pass.
passes.push_back("infer_clean_graph_pass");
for (auto& pass : ir_passes_) { for (auto& pass : ir_passes_) {
if (!disabled_ir_passes_.count(pass)) { if (!disabled_ir_passes_.count(pass)) {
passes.push_back(pass); passes.push_back(pass);
......
...@@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion. // larger fusion.
const std::vector<std::string> all_ir_passes_{{ const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here. // Manual update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
"embedding_fc_lstm_fuse_pass", // "embedding_fc_lstm_fuse_pass", //
...@@ -80,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -80,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", //
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
...@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type, ...@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
op->SetType(type); op->SetType(type);
op->SetInput("Xs", inputs); op->SetInput("Xs", inputs);
op->SetOutput("Xs", outputs); op->SetOutput("Xs", outputs);
op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(framework::OpRole::kForward));
} }
TEST(DataFlowGraph, Build_IR_Graph) { TEST(DataFlowGraph, Build_IR_Graph) {
......
...@@ -17,32 +17,14 @@ if(APPLE) ...@@ -17,32 +17,14 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
endif() endif()
function(inference_api_test TARGET_NAME) cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
if (WITH_TESTING) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
set(options "")
set(oneValueArgs SRC)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_test(${TARGET_NAME}
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
set_tests_properties(${TARGET_NAME}
PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
endif(WITH_TESTING)
endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
...@@ -50,10 +32,11 @@ cc_test(test_paddle_inference_api ...@@ -50,10 +32,11 @@ cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
inference_api_test(test_api_impl SRC api_impl_tester.cc if(WITH_TESTING)
ARGS test_word2vec test_image_classification) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
ARGS --dirname=${PYTHON_TESTS_DIR}/book) ARGS --dirname=${PYTHON_TESTS_DIR}/book)
...@@ -61,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -61,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
if(WITH_TESTING)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
endif() endif()
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -82,6 +82,7 @@ bool AnalysisPredictor::Init( ...@@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(
// Get the feed_target_names and fetch_target_names // Get the feed_target_names and fetch_target_names
PrepareFeedFetch(); PrepareFeedFetch();
return true; return true;
} }
...@@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
VLOG(3) << "predict cost: " << timer.toc() << "ms"; VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix the bug that reused TensorArray variables are not cleaned between runs.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true; return true;
} }
...@@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor( ...@@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run(); executor_->Run();
// Fix the bug that reused TensorArray variables are not cleaned between runs.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true; return true;
} }
......
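Both Run() and ZeroCopyRun() now finish with the same two-step cleanup: collect the LoDTensorArray variables held by the scope, then reset them so state from one batch cannot leak into the next. A rough sketch of that collect-then-reset shape with stand-in types; the real implementation lives in details/reset_tensor_array.{h,cc}, which is not shown in this diff:

#include <utility>
#include <vector>

struct FakeTensorArray { std::vector<int> steps; };  // stand-in for LoDTensorArray

class TensorArrayCleanerSketch {
 public:
  // Step 1: remember which arrays exist in the scope.
  void Collect(std::vector<FakeTensorArray*> arrays) { arrays_ = std::move(arrays); }
  // Step 2: clear them after every run so the next batch starts empty.
  void Reset() {
    for (auto* arr : arrays_) arr->steps.clear();
  }

 private:
  std::vector<FakeTensorArray*> arrays_;
};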
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them. // concurrency problems, so cache them.
std::vector<framework::LoDTensor> feed_tensors_; std::vector<framework::LoDTensor> feed_tensors_;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
}; };
} // namespace paddle } // namespace paddle
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
VLOG(3) << "predict cost: " << timer.toc() << "ms"; VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix the bug that reused TensorArray variables are not cleaned between runs.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
return true; return true;
} }
......
...@@ -26,11 +26,11 @@ limitations under the License. */ ...@@ -26,11 +26,11 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
...@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor { ...@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
std::vector<framework::OpDesc *> fetchs_; std::vector<framework::OpDesc *> fetchs_;
// Do not use unique_ptr, use parent scope to delete // Do not use unique_ptr, use parent scope to delete
framework::Scope *sub_scope_{nullptr}; framework::Scope *sub_scope_{nullptr};
details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
}; };
} // namespace paddle } // namespace paddle
...@@ -27,7 +27,9 @@ limitations under the License. */ ...@@ -27,7 +27,9 @@ limitations under the License. */
#define ACC_DIFF 1e-3 #define ACC_DIFF 1e-3
#endif #endif
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(word2vec_dirname, "",
"Directory of the word2vec inference model.");
DEFINE_string(book_dirname, "", "Directory of the book inference model.");
namespace paddle { namespace paddle {
...@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig GetConfig() { NativeConfig GetConfig() {
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; config.model_dir = FLAGS_word2vec_dirname;
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) { ...@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model"; FLAGS_book_dirname + "/image_classification_resnet.inference.model";
const bool is_combined = false; const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes =
...@@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) { ...@@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) { for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone(); auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid]; auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs; std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
...@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig config = GetConfig(); NativeConfig config = GetConfig();
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.model_dir = config.model_dir =
FLAGS_dirname + "/image_classification_resnet.inference.model"; FLAGS_book_dirname + "/image_classification_resnet.inference.model";
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config); auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<framework::LoDTensor> jobs(num_jobs); std::vector<framework::LoDTensor> jobs(num_jobs);
...@@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) { ...@@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) {
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int tid = 0; tid < num_jobs; ++tid) { for (int tid = 0; tid < num_jobs; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
auto predictor = main_predictor->Clone(); auto predictor = CreatePaddlePredictor(config);
auto& local_inputs = paddle_tensor_feeds[tid]; auto& local_inputs = paddle_tensor_feeds[tid];
std::vector<PaddleTensor> local_outputs; std::vector<PaddleTensor> local_outputs;
ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
...@@ -271,7 +273,7 @@ TEST(inference_api_native, word2vec_cpu_threads) { ...@@ -271,7 +273,7 @@ TEST(inference_api_native, word2vec_cpu_threads) {
MainThreadsWord2Vec(false /*use_gpu*/); MainThreadsWord2Vec(false /*use_gpu*/);
} }
TEST(inference_api_native, image_classification_cpu) { TEST(inference_api_native, image_classification_cpu) {
MainThreadsImageClassification(false /*use_gpu*/); MainImageClassification(false /*use_gpu*/);
} }
TEST(inference_api_native, image_classification_cpu_threads) { TEST(inference_api_native, image_classification_cpu_threads) {
MainThreadsImageClassification(false /*use_gpu*/); MainThreadsImageClassification(false /*use_gpu*/);
...@@ -279,15 +281,17 @@ TEST(inference_api_native, image_classification_cpu_threads) { ...@@ -279,15 +281,17 @@ TEST(inference_api_native, image_classification_cpu_threads) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
TEST(inference_api_native, word2vec_gpu_threads) { // Turn off temporarily for the unstable result.
MainThreadsWord2Vec(true /*use_gpu*/); // TEST(inference_api_native, word2vec_gpu_threads) {
} // MainThreadsWord2Vec(true /*use_gpu*/);
// }
TEST(inference_api_native, image_classification_gpu) { TEST(inference_api_native, image_classification_gpu) {
MainThreadsImageClassification(true /*use_gpu*/); MainImageClassification(true /*use_gpu*/);
}
TEST(inference_api_native, image_classification_gpu_threads) {
MainThreadsImageClassification(true /*use_gpu*/);
} }
// Turn off temporarily for the unstable result.
// TEST(inference_api_native, image_classification_gpu_threads) {
// MainThreadsImageClassification(true /*use_gpu*/);
// }
#endif #endif
......
...@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { ...@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
//# 1. Create PaddlePredictor with a config. //# 1. Create PaddlePredictor with a config.
NativeConfig config0; NativeConfig config0;
config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; config0.model_dir = FLAGS_dirname;
config0.use_gpu = true; config0.use_gpu = true;
config0.fraction_of_gpu_memory = 0.3; config0.fraction_of_gpu_memory = 0.3;
config0.device = 0; config0.device = 0;
MixedRTConfig config1; MixedRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; config1.model_dir = FLAGS_dirname;
config1.use_gpu = true; config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3; config1.fraction_of_gpu_memory = 0.3;
config1.device = 0; config1.device = 0;
......
...@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}") ...@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include") include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
if (NOT WIN32) if (NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/install/snappy/include") include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
...@@ -61,8 +62,8 @@ endif(NOT WIN32) ...@@ -61,8 +62,8 @@ endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost") include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3") include_directories("${PADDLE_LIB}/third_party/eigen3")
if (NOT WIN32) if (NOT WIN32)
if (USE_TENSORRT AND WITH_GPU) if (USE_TENSORRT AND WITH_GPU)
include_directories("${TENSORRT_INCLUDE_DIR}") include_directories("${TENSORRT_INCLUDE_DIR}")
link_directories("${TENSORRT_LIB_DIR}") link_directories("${TENSORRT_LIB_DIR}")
endif() endif()
...@@ -77,13 +78,14 @@ endif(NOT WIN32) ...@@ -77,13 +78,14 @@ endif(NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib") link_directories("${PADDLE_LIB}/paddle/lib")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL) if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include") include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH}) if(EXISTS ${MKLDNN_PATH})
...@@ -107,7 +109,7 @@ if (NOT WIN32) ...@@ -107,7 +109,7 @@ if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS} set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB}) ${EXTERNAL_LIB})
else() else()
set(DEPS ${DEPS} set(DEPS ${DEPS}
...@@ -120,7 +122,7 @@ endif(NOT WIN32) ...@@ -120,7 +122,7 @@ endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
if(NOT WIN32) if(NOT WIN32)
if (USE_TENSORRT) if (USE_TENSORRT)
set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
endif() endif()
......
...@@ -16,7 +16,7 @@ if [ $2 == ON ]; then ...@@ -16,7 +16,7 @@ if [ $2 == ON ]; then
fi fi
if [ $3 == ON ]; then if [ $3 == ON ]; then
use_gpu_list='true false' use_gpu_list='true false'
else else
use_gpu_list='false' use_gpu_list='false'
fi fi
...@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU=$TEST_GPU_CPU \ -DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB -DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j make -j
word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
if [ -d $word2vec_model ]; then if [ -d $word2vec_model ]; then
for use_gpu in $use_gpu_list; do for use_gpu in $use_gpu_list; do
./simple_on_word2vec \ ./simple_on_word2vec \
...@@ -83,7 +83,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -83,7 +83,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_STATIC_LIB=$WITH_STATIC_LIB -DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j make -j
for use_gpu in $use_gpu_list; do for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do for vis_demo_name in $vis_demo_list; do
./vis_demo \ ./vis_demo \
--modeldir=$DATA_DIR/$vis_demo_name/model \ --modeldir=$DATA_DIR/$vis_demo_name/model \
--data=$DATA_DIR/$vis_demo_name/data.txt \ --data=$DATA_DIR/$vis_demo_name/data.txt \
...@@ -95,7 +95,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -95,7 +95,7 @@ for WITH_STATIC_LIB in ON OFF; do
fi fi
done done
done done
# --------tensorrt mobilenet------ # --------tensorrt mobilenet------
if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
rm -rf * rm -rf *
...@@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do ...@@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DUSE_TENSORRT=$USE_TENSORRT \ -DUSE_TENSORRT=$USE_TENSORRT \
-DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
-DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
make -j make -j
./trt_mobilenet_demo \ ./trt_mobilenet_demo \
--modeldir=$DATA_DIR/mobilenet/model \ --modeldir=$DATA_DIR/mobilenet/model \
--data=$DATA_DIR/mobilenet/data.txt \ --data=$DATA_DIR/mobilenet/data.txt \
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
namespace paddle {
namespace details {
// Should be called after the parameters are loaded.
void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
if (flag_) {
for (auto &var_name : scope->LocalVarNames()) {
auto *var = scope->FindVar(var_name);
// TODO(Superjomn) should avoid the case when a TensorArray is a
// parameter.
if (var_name == "feed" || var_name == "fetch") continue;
if (var->Type() == typeid(framework::LoDTensorArray)) {
VLOG(4) << "collect " << var_name;
arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
}
}
for (auto *kid : scope->kids()) {
CollectTensorArrays(kid);
}
VLOG(3) << "Collect " << arrays_.size() << " arrays";
flag_ = false;
}
}
// Should be called when `Run` finished.
void TensorArrayBatchCleaner::ResetTensorArray() {
for (auto *arr : arrays_) {
arr->clear();
}
}
} // namespace details
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace details {
// Clean the TensorArray for each batch to make the behavior the same as in the
// training phase.
struct TensorArrayBatchCleaner {
// Works around tensor arrays not being cleared between runs in inference scenarios.
void CollectTensorArrays(framework::Scope *scope);
void ResetTensorArray();
private:
bool flag_{true};
std::vector<framework::LoDTensorArray *> arrays_;
};
} // namespace details
} // namespace paddle
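A minimal usage sketch of the cleaner defined above (hypothetical caller code; the variable name, the resize call, and the skipped Run() step are illustrative assumptions, not part of this patch):
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"

void TensorArrayCleanerSketch() {
  paddle::framework::Scope scope;
  // Pretend model loading created a LoDTensorArray variable in the scope.
  auto *arr = scope.Var("rnn_step_outputs")
                  ->GetMutable<paddle::framework::LoDTensorArray>();
  arr->resize(4);  // pretend a previous Run() left four steps behind

  paddle::details::TensorArrayBatchCleaner cleaner;
  cleaner.CollectTensorArrays(&scope);  // once, after the parameters are loaded
  // ... predictor->Run(...) would go here ...
  cleaner.ResetTensorArray();  // after every Run, so the next batch starts clean
}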
...@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, ...@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency, int epoch = 1) { double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid << ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======"; << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
<< " ======";
if (epoch > 1) { if (epoch > 1) {
int samples = batch_size * epoch; int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples LOG(INFO) << "====== sample number: " << samples
......
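For reference, the fps value added to PrintTime above is just the reciprocal of the per-batch latency in seconds; a tiny standalone check with a made-up latency:
#include <cstdio>

int main() {
  double latency_ms = 12.5;                  // made-up per-batch latency
  double fps = 1.0 / (latency_ms / 1000.0);  // same formula as PrintTime: 80 fps
  std::printf("latency = %.1f ms, fps = %.1f\n", latency_ms, fps);
  return 0;
}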
...@@ -124,7 +124,7 @@ class ZeroCopyTensor { ...@@ -124,7 +124,7 @@ class ZeroCopyTensor {
std::vector<std::vector<size_t>> lod() const; std::vector<std::vector<size_t>> lod() const;
protected: protected:
ZeroCopyTensor(void* scope) : scope_{scope} {} explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; } void SetName(const std::string& name) { name_ = name; }
void* FindTensor() const; void* FindTensor() const;
...@@ -259,12 +259,6 @@ struct AnalysisConfig : public NativeConfig { ...@@ -259,12 +259,6 @@ struct AnalysisConfig : public NativeConfig {
kExclude // Specify the disabled passes in `ir_passes`. kExclude // Specify the disabled passes in `ir_passes`.
}; };
void SetIncludeMode() {
ir_mode = IrPassMode::kInclude;
// this pass has to be run at the beginning of all fuse passes
ir_passes = {"infer_clean_graph_pass"};
}
// Determine whether to perform graph optimization. // Determine whether to perform graph optimization.
bool enable_ir_optim = true; bool enable_ir_optim = true;
// Manually determine the IR passes to run. // Manually determine the IR passes to run.
......
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
function (inference_base_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS ARGS DEPS)
cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
endfunction()
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
message(STATUS "finish downloading ${filename}")
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
endfunction()
function(download_model_and_data install_dir model_name data_name) function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir}) if (NOT EXISTS ${install_dir})
......
...@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST(Analyzer_rnn1, profile) { TEST(Analyzer_rnn1, profile) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.use_gpu = false;
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
......
...@@ -139,6 +139,9 @@ void TestMultiThreadPrediction( ...@@ -139,6 +139,9 @@ void TestMultiThreadPrediction(
} }
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
#ifdef PADDLE_WITH_MKLDNN
platform::set_cur_thread_id(static_cast<int>(tid) + 1);
#endif
// Each thread should have local inputs and outputs. // Each thread should have local inputs and outputs.
// The inputs of each thread are all the same. // The inputs of each thread are all the same.
std::vector<std::vector<PaddleTensor>> inputs_tid = inputs; std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
......
...@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND) ...@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
else() else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif() endif()
op_library(hash_op DEPS xxhash)
op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
op_library(sum_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor)
...@@ -300,6 +301,7 @@ op_library(flatten_op DEPS reshape_op) ...@@ -300,6 +301,7 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/add_position_encoding_op.h"
namespace paddle {
namespace operators {
class AddPositionEncodingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"X(Input) of add_position_encoding_op should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Out(Output) of add_position_encoding_op should not be null.");
auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Out@GRAD must not be null.");
auto out_dims = ctx->GetInputDim("Out");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
}
}
};
class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of AddPositionEncoding operator");
AddOutput("Out", "Output of AddPositionEncoding operator");
AddAttr<float>("alpha", "The scale of Original Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& alpha) {
PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must not be less than 0.0.");
});
AddAttr<float>("beta", "The scale of Position Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& beta) {
PADDLE_ENFORCE(beta >= 0.0f, "'beta' must not be less than 0.0.");
});
AddComment(R"DOC(
Add Position Encoding Operator.
This operator adds a sinusoidal position encoding to a scaled copy of the input. With half = enc_size / 2,
for every position j and channel k (0 <= k < half):
    Out[j, k]        = alpha * X[j, k]        + beta * sin(j / 10000^(k / (half - 1)))
    Out[j, half + k] = alpha * X[j, half + k] + beta * cos(j / 10000^(k / (half - 1)))
The sizes of each dimension of the parameters are checked during shape inference.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plt = paddle::platform;
REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
ops::AddPositionEncodingOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
REGISTER_OP_CPU_KERNEL(
add_position_encoding,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
add_position_encoding_grad,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AddPositionEncodingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::LoDTensor>("X");
auto& x_lod = X->lod();
auto* src_ptr = X->data<T>();
auto* Out = context.Output<framework::LoDTensor>("Out");
auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
float alpha = context.Attr<float>("alpha");
float beta = context.Attr<float>("beta");
auto x_dim = X->dims();
int batch_size = 0;
int max_seq_len = 0;
int enc_size = 0;
if (x_lod.empty()) {
PADDLE_ENFORCE(
x_dim.size() == 3UL,
"The input X of Add Position Encoding should be 3-D Tensor!");
batch_size = x_dim[0];
max_seq_len = x_dim[1];
enc_size = x_dim[2];
} else {
PADDLE_ENFORCE(
x_dim.size() == 2UL,
"The input X of Add Position Encoding should be 2-D LoDTensor!");
PADDLE_ENFORCE(
x_lod.size() == 1UL,
"The Add Position Encoding Op only supports lod_level == 1!");
batch_size = x_lod[0].size() - 1;
max_seq_len = -1;
enc_size = x_dim[1];
}
PADDLE_ENFORCE(enc_size % 2 == 0, "Only even encoding sizes are supported!");
const int half_size = enc_size / 2;
for (int i = 0; i < batch_size; ++i) {
const int max_length =
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
for (int j = 0; j < max_length; ++j) {
for (int k = 0; k < half_size; ++k) {
const double val = (half_size > 1)
? j / pow(10000.0, double(k) / (half_size - 1))
: j / 10000.0;
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
dst_ptr[half_size + k] =
src_ptr[half_size + k] * alpha + cos(val) * beta;
}
src_ptr += enc_size;
dst_ptr += enc_size;
}
}
}
};
template <typename DeviceContext, typename T>
class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* dOut =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto* dX =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dX);
float alpha = context.Attr<float>("alpha");
auto* place =
context.template device_context<DeviceContext>().eigen_device();
dx.device(*place) = dout * static_cast<T>(alpha);
}
};
} // namespace operators
} // namespace paddle
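A small framework-free sketch of the same sinusoidal encoding the kernel above applies; the sizes and input values below are made up for illustration:
#include <cmath>
#include <cstdio>
#include <vector>

// Mirrors AddPositionEncodingKernel for one dense sequence:
// out[j][k]        = alpha * x[j][k]        + beta * sin(j / 10000^(k / (half - 1)))
// out[j][half + k] = alpha * x[j][half + k] + beta * cos(j / 10000^(k / (half - 1)))
int main() {
  const int seq_len = 4, enc_size = 6, half = enc_size / 2;
  const float alpha = 1.0f, beta = 1.0f;
  std::vector<float> x(seq_len * enc_size, 0.5f), out(seq_len * enc_size, 0.0f);
  for (int j = 0; j < seq_len; ++j) {
    for (int k = 0; k < half; ++k) {
      double val = j / std::pow(10000.0, static_cast<double>(k) / (half - 1));
      out[j * enc_size + k] = x[j * enc_size + k] * alpha + std::sin(val) * beta;
      out[j * enc_size + half + k] =
          x[j * enc_size + half + k] * alpha + std::cos(val) * beta;
    }
  }
  std::printf("out[1][0] = %f\n", out[1 * enc_size + 0]);  // 0.5 + sin(1)
  return 0;
}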
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedSpatialTransformerDescriptor =
platform::ScopedSpatialTransformerDescriptor;
template <typename T>
class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output");
const T* theta_data = theta->data<T>();
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
T* output_data = output->mutable_data<T>(
{n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data));
}
};
template <typename T>
class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
Tensor h_sizes;
int* h_size_data;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
h_size_data = h_sizes.data<int>();
} else {
h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
h_size_data[0] = n;
h_size_data[1] = size_attr[1];
h_size_data[2] = size_attr[2];
h_size_data[3] = size_attr[3];
}
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
const T* output_grad_data = output_grad->data<T>();
T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridOpKernel<float>,
paddle::operators::CUDNNAffineGridOpKernel<double>);
REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNAffineGridGradOpKernel<float>,
paddle::operators::CUDNNAffineGridGradOpKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/affine_grid_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> {
framework::Tensor operator()(T start, T end, int count,
const framework::ExecutionContext& ctx) {
Tensor numbers;
T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
T slice = (end - start) / (T)(count - 1);
for (int i = 0; i < count; ++i) {
number_data[i] = start + (T)i * slice;
}
return numbers;
}
};
class AffineGridOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Theta"),
"Input(Theta) of AffineGridOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of AffineGridOp should not be null.");
auto theta_dims = ctx->GetInputDim("Theta");
PADDLE_ENFORCE(theta_dims.size() == 3,
"AffineGrid's Input(Theta) should be 3-D tensor.");
auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
if (output_shape.size() == 0) {
PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
"Input(OutputShape) of AffineGridOp should not be null if "
"attr(output_shape) is not configured.");
auto output_shape_dims = ctx->GetInputDim("OutputShape");
PADDLE_ENFORCE(output_shape_dims.size() == 1,
"AffineGrid's Input(OutputShape) should be 1-D tensor.");
} else {
PADDLE_ENFORCE(output_shape.size() == 4,
"The size of attr(output_shape) should be 4.");
}
PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");
// N * H * W * 2
ctx->SetOutputDim("Output",
framework::make_ddim({theta_dims[0], -1, -1, 2}));
ctx->ShareLoD("Theta", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library = framework::LibraryType::kCUDNN;
}
#endif
auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
return framework::OpKernelType(data_type, ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library);
}
};
class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Theta",
"(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
"It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
"y_1).");
AddInput("OutputShape",
"(Tensor) The shape of target image with format [N, C, H, W].")
.AsDispensable();
AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(true);
AddAttr<std::vector<int>>(
"output_shape",
"The target output image shape with format [N, C, H, W].")
.SetDefault(std::vector<int>());
AddComment(R"DOC(
It generates a grid of (x,y) coordinates using the parameters of the
affine transformation that correspond to a set of points where the input
feature map should be sampled to produce the transformed output feature map.
Given:
Theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
OutputShape = [2, 3, 5, 5]
Step 1:
Generate relative coordinates according to OutputShape.
The values of relative coordinates are in the interval between -1 and 1.
The shape of the relative coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
Step 2:
Transpose and reshape C to shape [H * W, 2] and append ones to the last dimension. Then we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step 3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
)DOC");
}
};
class AffineGridOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto theta_dims = ctx->GetInputDim("Theta");
if (ctx->HasOutput(framework::GradVarName("Theta"))) {
ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
}
};
class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("affine_grid_grad");
op->SetInput("Theta", Input("Theta"));
op->SetInput("OutputShape", Input("OutputShape"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
ops::AffineGridGradMaker);
REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
REGISTER_OP_CPU_KERNEL(
affine_grid,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
affine_grid_grad,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
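The three steps described in the affine_grid DOC above can be reproduced with a framework-free sketch; H, W and the identity-like theta below are illustrative assumptions, not the operator's actual code path:
#include <cstdio>
#include <vector>

int main() {
  const int H = 2, W = 2;
  // Identity-like affine parameters Theta for one sample, shape [2, 3].
  const float theta[2][3] = {{1.f, 0.f, 0.f}, {0.f, 1.f, 0.f}};
  // Steps 1 and 2: relative coordinates in [-1, 1] with an appended ones column.
  std::vector<float> c;  // H * W rows of [x, y, 1]
  for (int i = 0; i < H; ++i) {
    for (int j = 0; j < W; ++j) {
      float y = (H > 1) ? -1.f + 2.f * i / (H - 1) : -1.f;
      float x = (W > 1) ? -1.f + 2.f * j / (W - 1) : -1.f;
      c.insert(c.end(), {x, y, 1.f});
    }
  }
  // Step 3: Output = C_ * Theta^T, one (x, y) sampling point per grid cell.
  for (int p = 0; p < H * W; ++p) {
    float ox = 0.f, oy = 0.f;
    for (int k = 0; k < 3; ++k) {
      ox += c[p * 3 + k] * theta[0][k];
      oy += c[p * 3 + k] * theta[1][k];
    }
    std::printf("point %d -> (%.1f, %.1f)\n", p, ox, oy);
  }
  return 0;
}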
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
using Array2 = Eigen::DSizes<int64_t, 2>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
/**
*Return a tensor with evenly spaced numbers over a specified interval.
*/
template <typename DeviceContext, typename T>
struct Linspace {
framework::Tensor operator()(T start, T end, int count,
const framework::ExecutionContext& ctx);
};
template <typename DeviceContext, typename T>
class AffineGridOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0));
Linspace<DeviceContext, T> linspace;
// Get indexes of height with shape [height, width, 1]
auto h_idx = linspace((T)-1, (T)1, h, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1]
auto w_idx = linspace((T)-1, (T)1, w, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1]
Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor grid;
grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
auto grid_t = EigenTensor<T, 4>::From(grid);
grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1))
.concatenate(h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1))
.shuffle(Array2(1, 0))
.reshape(Array3(h, w, 1)),
2)
.eval()
.concatenate(ones_t, 2)
.reshape(Array4(1, h, w, 3))
.broadcast(Array4(n, 1, 1, 1));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
T(0));
}
}
};
template <typename DeviceContext, typename T>
class AffineGridGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), theta_grad,
static_cast<T>(0));
Linspace<DeviceContext, T> linspace;
// Get indexes of height with shape [height, width, 1]
auto h_idx = linspace((T)-1, (T)1, h, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1]
auto w_idx = linspace((T)-1, (T)1, w, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1]
Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor grid;
grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
auto grid_t = EigenTensor<T, 4>::From(grid);
grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1))
.concatenate(h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1))
.shuffle(Array2(1, 0))
.reshape(Array3(h, w, 1)),
2)
.eval()
.concatenate(ones_t, 2)
.reshape(Array4(1, h, w, 3))
.broadcast(Array4(n, 1, 1, 1));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
&sliced_theta_grad, T(0));
}
}
};
} // namespace operators
} // namespace paddle
...@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor { ...@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor {
bool tensor_on_gpu_; bool tensor_on_gpu_;
size_t beam_size_; size_t beam_size_;
int end_id_; int end_id_;
// TODO(Superjomn) This might cause a serious performance issue in
// concurrent
// scenarios.
const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_ids_origin_;
const LoDTensorArray& step_scores_origin_; const LoDTensorArray& step_scores_origin_;
LoDTensorArray step_ids_ = LoDTensorArray(); LoDTensorArray step_ids_ = LoDTensorArray();
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <limits> #include <limits>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> { ...@@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
auto emission_dims = emission_weights.dims(); auto emission_dims = emission_weights.dims();
const size_t seq_len = emission_dims[0]; const size_t seq_len = emission_dims[0];
const size_t tag_num = emission_dims[1]; const size_t tag_num = emission_dims[1];
const size_t state_trans_base_idx = 2;
const T* x = emission_weights.data<T>(); const T* x = emission_weights.data<T>();
const T* w = transition_weights.data<T>(); const T* w = transition_weights.data<T>();
int64_t* path = decoded_path->data<int64_t>(); int64_t* path = decoded_path->data<int64_t>();
...@@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> { ...@@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
Tensor track; Tensor track;
int* track_value = int* track_value =
track.mutable_data<int>(emission_dims, platform::CPUPlace()); track.mutable_data<int>(emission_dims, platform::CPUPlace());
const auto& ker = math::jitkernel::KernelPool::Instance()
#ifdef __AVX__ .template Get<math::jitkernel::CRFDecodeKernel<T>>(
// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or static_cast<int>(tag_num));
// 16 elements per iteration. Then it can implement the parallel processing. ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
// Only optimize for float type.
#ifdef __AVX512F__
size_t step_size = 16;
#else
size_t step_size = 8;
#endif
if (std::is_same<T, float>::value && (tag_num >= step_size)) {
size_t steps = tag_num / step_size;
size_t remain = tag_num % step_size;
int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
// Setup the alpha initial value.
size_t i_offset = 0;
for (size_t i = 0; i <= steps; ++i) {
#ifdef __AVX512F__
// Declare the variable for the content of weights, input and alpha
// values.
__m512 w_content, x_content, alpha_content;
// Load the relevant data into the variables from un-aligned address.
w_content = _mm512_loadu_ps((const float*)(w + i_offset));
x_content = _mm512_loadu_ps((const float*)(x + i_offset));
alpha_content = _mm512_add_ps(w_content, x_content);
// Save the alpha value.
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
alpha_content);
#else
// Declare the variable for the content of weights, input and alpha
// values.
__m256 w_content, x_content, alpha_content;
// Load the relevant data into the variables from un-aligned address.
w_content = _mm256_loadu_ps((const float*)(w + i_offset));
x_content = _mm256_loadu_ps((const float*)(x + i_offset));
alpha_content = _mm256_add_ps(w_content, x_content);
// Save the alpha value.
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
alpha_content);
#endif
i_offset += step_size;
if (i == steps - 1) {
if (remain > 0) {
i_offset += last_offset;
} else {
break;
}
}
}
// Use the column-major strategy to get the location of maximum score.
size_t seq_offset = 0;
for (size_t k = 1; k < seq_len; ++k) {
size_t j_offset = 0;
for (size_t j = 0; j <= steps; ++j) {
#ifdef __AVX512F__
// Initialize the variables of maximum score and location.
__m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
__m512i max_j = _mm512_setzero_si512();
#else
// Initialize the variables of maximum score and location.
__m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
__m256i max_j = _mm256_set1_epi32(0);
#endif
// Calculate the offset of transition_weights.
size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
for (size_t i = 0; i < tag_num; ++i) {
#ifdef __AVX512F__
// Initalize the content of alpha variable with related offset.
__m512 alpha_content =
_mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
// Obtain the content of weights from un-aligned address.
__m512 w_content =
_mm512_loadu_ps((const float*)(w + trans_offset));
__m512 score_v = _mm512_add_ps(alpha_content, w_content);
__mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm512_mask_set1_epi32(max_j, mask, i);
// Update the max_score value.
max_score = _mm512_max_ps(max_score, score_v);
#else
// Initalize the content of alpha variable with related offset.
__m256 alpha_content = _mm256_broadcast_ss(
(const float*)(alpha_value + seq_offset + i));
// Obtain the content of weights from un-aligned address.
__m256 w_content =
_mm256_loadu_ps((const float*)(w + trans_offset));
__m256 score_v = _mm256_add_ps(alpha_content, w_content);
__m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
#ifdef __AVX2__
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm256_or_si256(
_mm256_andnot_si256((__m256i)mask, max_j),
_mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
#else
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
__m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
__m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
// According to the mask value, it update the index of the max_score
// location.
max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
#endif
// Update the max_score value.
max_score = _mm256_max_ps(max_score, score_v);
#endif
trans_offset += tag_num;
}
#ifdef __AVX512F__
// Update the alpha and track values.
__m512 x_content = _mm512_loadu_ps(
(const float*)(x + seq_offset + tag_num + j_offset));
max_score = _mm512_add_ps(max_score, x_content);
_mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
tag_num + j_offset),
max_score);
_mm512_storeu_si512(
reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
j_offset),
max_j);
#else
// Update the alpha and track values.
__m256 x_content = _mm256_loadu_ps(
(const float*)(x + seq_offset + tag_num + j_offset));
max_score = _mm256_add_ps(max_score, x_content);
_mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
tag_num + j_offset),
max_score);
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
j_offset),
max_j);
#endif
// Calculate the offset of next step
j_offset += step_size;
if (j == steps - 1) {
if (remain > 0) {
j_offset += last_offset;
} else {
break;
}
}
}
seq_offset += tag_num;
}
} else {
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
for (size_t k = 1; k < seq_len; ++k) {
for (size_t i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max();
int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] +
w[(j + state_trans_base_idx) * tag_num + i];
if (score > max_score) {
max_score = score;
max_j = j;
}
}
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
track_value[k * tag_num + i] = max_j;
}
}
}
#else
for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
for (size_t k = 1; k < seq_len; ++k) {
for (size_t i = 0; i < tag_num; ++i) {
T max_score = -std::numeric_limits<T>::max();
int max_j = 0;
for (size_t j = 0; j < tag_num; ++j) {
T score = alpha_value[(k - 1) * tag_num + j] +
w[(j + state_trans_base_idx) * tag_num + i];
if (score > max_score) {
max_score = score;
max_j = j;
}
}
alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
track_value[k * tag_num + i] = max_j;
}
}
#endif
T max_score = -std::numeric_limits<T>::max(); T max_score = -std::numeric_limits<T>::max();
int max_i = 0; int max_i = 0;
for (size_t i = 0; i < tag_num; ++i) { for (size_t i = 0; i < tag_num; ++i) {
......
...@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase { ...@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
} }
}; };
class DeleteVarOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {}
};
class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -48,4 +53,5 @@ It should not be configured by users directly. ...@@ -48,4 +53,5 @@ It should not be configured by users directly.
REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker,
paddle::operators::DeleteVarOpInfoMaker); paddle::operators::DeleteVarOpInfoMaker,
paddle::operators::DeleteVarOpShapeInference);
...@@ -439,31 +439,88 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -439,31 +439,88 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
// TODO(buxingyuan): Add Document AddInput(
AddInput("RpnRois", "RpnRois."); "RpnRois",
AddInput("GtClasses", "GtClasses."); "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. "
AddInput("IsCrowd", "IsCrowd."); "N is the number of the GenerateProposalOp's output, "
AddInput("GtBoxes", "GtBoxes."); "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
AddInput("ImInfo", "ImInfo."); AddInput("GtClasses",
"(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
AddOutput("Rois", "Rois."); "M is the number of groundtruth, "
AddOutput("LabelsInt32", "LabelsInt32."); "each element is a class label of groundtruth.");
AddOutput("BboxTargets", "BboxTargets."); AddInput(
AddOutput("BboxInsideWeights", "BboxInsideWeights."); "IsCrowd",
AddOutput("BboxOutsideWeights", "BboxOutsideWeights."); "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
"M is the number of groundtruth, "
AddAttr<int>("batch_size_per_im", "batch_size_per_im"); "each element is a flag indicates whether a groundtruth is crowd.");
AddAttr<float>("fg_fraction", "fg_fraction"); AddInput(
AddAttr<float>("fg_thresh", "fg_thresh"); "GtBoxes",
AddAttr<float>("bg_thresh_hi", "bg_thresh_hi"); "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. "
AddAttr<float>("bg_thresh_lo", "bg_thresh_lo"); "M is the number of groundtruth, "
AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights"); "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
AddAttr<int>("class_nums", "class_nums"); AddInput("ImInfo",
AddAttr<bool>("use_random", "use_random").SetDefault(true); "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
"B is the number of input images, "
"each element consists of im_height, im_width, im_scale.");
AddOutput(
"Rois",
"(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
"P usuall equal to batch_size_per_im * batch_size, "
"each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
AddOutput("LabelsInt32",
"(LoDTensor), This output is a 2D LoDTensor with shape [P], "
"each element repersents a class label of a roi");
AddOutput("BboxTargets",
"(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
"class_nums], "
"each element repersents a box label of a roi");
AddOutput(
"BboxInsideWeights",
"(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
"class_nums], "
"each element indicates whether a box should contribute to loss.");
AddOutput(
"BboxOutsideWeights",
"(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
"class_nums], "
"each element indicates whether a box should contribute to loss.");
AddAttr<int>("batch_size_per_im", "Batch size of rois per images.");
AddAttr<float>("fg_fraction",
"Foreground fraction in total batch_size_per_im.");
AddAttr<float>(
"fg_thresh",
"Overlap threshold which is used to chose foreground sample.");
AddAttr<float>("bg_thresh_hi",
"Overlap threshold upper bound which is used to chose "
"background sample.");
AddAttr<float>("bg_thresh_lo",
"Overlap threshold lower bound which is used to chose "
"background sample.");
AddAttr<std::vector<float>>("bbox_reg_weights", "Box regression weights.");
AddAttr<int>("class_nums", "Class number.");
AddAttr<bool>(
"use_random",
"Use random sampling to choose foreground and background boxes.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals Labels Operator. Given the bounding boxes produced by GenerateProposalOp and the groundtruth boxes,
)DOC"); this operator samples foreground and background boxes from them and computes the loss targets.
RpnRois are the output boxes of the RPN, processed by generate_proposal_op. These boxes
are combined with the groundtruth boxes and sampled according to batch_size_per_im and fg_fraction.
An instance whose overlap with the groundtruth is greater than fg_thresh is considered a foreground sample.
An instance whose overlap with the groundtruth is greater than bg_thresh_lo and lower than bg_thresh_hi
is considered a background sample.
After all foreground and background boxes are chosen (the so-called Rois),
random sampling is applied to make sure
the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
For each box in Rois, we assign a classification target (class label) and regression targets (box label).
Finally, BboxInsideWeights and BboxOutsideWeights are used to specify whether each box contributes to the training loss.
)DOC");
} }
}; };
......
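A rough, framework-free sketch of the sampling rule described in the DOC above; the thresholds, overlaps and counts are made up, and the real operator additionally handles crowd boxes and per-class box targets:
#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  // Max overlap of each RoI with any groundtruth box (made-up values).
  std::vector<float> overlaps = {0.9f, 0.7f, 0.4f, 0.2f, 0.05f, 0.3f};
  const float fg_thresh = 0.5f, bg_thresh_hi = 0.5f, bg_thresh_lo = 0.1f;
  const int batch_size_per_im = 4;
  const float fg_fraction = 0.25f;

  std::vector<int> fg, bg;
  for (int i = 0; i < static_cast<int>(overlaps.size()); ++i) {
    if (overlaps[i] >= fg_thresh) {
      fg.push_back(i);
    } else if (overlaps[i] >= bg_thresh_lo && overlaps[i] < bg_thresh_hi) {
      bg.push_back(i);
    }
  }
  // Cap the foreground count, fill the rest of the batch with background RoIs.
  std::mt19937 rng(0);
  std::shuffle(fg.begin(), fg.end(), rng);
  std::shuffle(bg.begin(), bg.end(), rng);
  int max_fg = static_cast<int>(batch_size_per_im * fg_fraction);
  int fg_num = std::min(static_cast<int>(fg.size()), max_fg);
  int bg_num = std::min(static_cast<int>(bg.size()), batch_size_per_im - fg_num);
  std::printf("sampled %d foreground and %d background RoIs\n", fg_num, bg_num);
  return 0;
}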
...@@ -284,7 +284,7 @@ static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, ...@@ -284,7 +284,7 @@ static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
selected_indices.push_back(idx); selected_indices.push_back(idx);
++selected_num; ++selected_num;
} }
sorted_indices.erase(sorted_indices.end()); sorted_indices.erase(sorted_indices.end() - 1);
if (flag && eta < 1 && adaptive_threshold > 0.5) { if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta; adaptive_threshold *= eta;
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/dropout_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
"will be dropped.") "will be dropped.")
.SetDefault(false); .SetDefault(false);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0); AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddAttr<std::string>(
"dropout_implementation",
"[\"downgrade_in_infer\"|\"upscale_in_train\"]"
"There are two kinds of ways to implement dropout"
"(the mask below is a tensor have the same shape with input"
"the value of mask is 0 or 1, the ratio of 0 is dropout_prob)"
"1. downgrade_in_infer(default), downgrade the outcome at inference "
"time"
" train: out = input * mask"
" inference: out = input * dropout_prob"
"2. upscale_in_train, upscale the outcome at training time, do nothing "
"in inference"
" train: out = input * mask / ( 1.0 - dropout_prob )"
" inference: out = input"
" dropout op can be removed from the program. the program will be "
"efficient")
.SetDefault("downgrade_in_infer")
.AddCustomChecker([](const std::string& type) {
PADDLE_ENFORCE(
type == "downgrade_in_infer" || type == "upscale_in_train",
"dropout_implementation can only be downgrade_in_infer or "
"upscale_in_train");
});
AddComment(R"DOC( AddComment(R"DOC(
Dropout Operator. Dropout Operator.
...@@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ...@@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>); dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
dropout_grad, dropout_grad,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>); ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <thrust/iterator/counting_iterator.h> #include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h> #include <thrust/random.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include <string>
#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -26,7 +27,8 @@ namespace operators { ...@@ -26,7 +27,8 @@ namespace operators {
template <typename T> template <typename T>
__global__ void RandomGenerator(const size_t n, const int seed, __global__ void RandomGenerator(const size_t n, const int seed,
const float dropout_prob, const T* src, const float dropout_prob, const T* src,
T* mask_data, T* dst) { T* mask_data, T* dst,
bool is_upscale_in_train) {
thrust::minstd_rand rng; thrust::minstd_rand rng;
rng.seed(seed); rng.seed(seed);
thrust::uniform_real_distribution<float> dist(0, 1); thrust::uniform_real_distribution<float> dist(0, 1);
...@@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed, ...@@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed,
if (dist(rng) < dropout_prob) { if (dist(rng) < dropout_prob) {
mask = static_cast<T>(0); mask = static_cast<T>(0);
} else { } else {
mask = static_cast<T>(1); if (is_upscale_in_train) {
mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
} else {
mask = static_cast<T>(1);
}
} }
dest = s * mask; dest = s * mask;
mask_data[idx] = mask; mask_data[idx] = mask;
...@@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
y->mutable_data<T>(context.GetPlace()); y->mutable_data<T>(context.GetPlace());
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto dropout_implementation =
context.Attr<std::string>("dropout_implementation");
auto& place = *context.template device_context<Place>().eigen_device(); auto& place = *context.template device_context<Place>().eigen_device();
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
...@@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
int grid = (x->numel() + threads - 1) / threads; int grid = (x->numel() + threads - 1) / threads;
RandomGenerator< RandomGenerator<
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>( T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, seed, dropout_prob, x_data, mask_data, y_data); size, seed, dropout_prob, x_data, mask_data, y_data,
(dropout_implementation == "upscale_in_train"));
} else { } else {
auto X = EigenMatrix<T>::Reshape(*x, 1); auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1); auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); if (dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
}
} }
} }
}; };
...@@ -99,6 +112,8 @@ namespace ops = paddle::operators; ...@@ -99,6 +112,8 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>, dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>); ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
REGISTER_OP_CUDA_KERNEL(dropout_grad, ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
ops::DropoutGradKernel<plat::CUDADeviceContext, float>); REGISTER_OP_CUDA_KERNEL(
dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <random> #include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
auto* y_data = y->mutable_data<T>(context.GetPlace()); auto* y_data = y->mutable_data<T>(context.GetPlace());
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto dropout_implementation =
context.Attr<std::string>("dropout_implementation");
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
...@@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
engine.seed(seed); engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1); std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(mask->dims()); size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
if (dist(engine) < dropout_prob) { if (dist(engine) < dropout_prob) {
mask_data[i] = 0; mask_data[i] = 0;
y_data[i] = 0; y_data[i] = 0;
} else { } else {
mask_data[i] = 1; if (dropout_implementation == "upscale_in_train") {
y_data[i] = x_data[i]; mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
} else {
mask_data[i] = 1;
y_data[i] = x_data[i];
}
} }
} }
} else { } else {
...@@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
auto Y = EigenMatrix<T>::Reshape(*y, 1); auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place = auto& place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
Y.device(place) = X * (1.0f - dropout_prob); if (dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
}
} }
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
class FakeInitInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FakeInitOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FakeInitOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
framework::Tensor *tensor = nullptr;
auto &out_var = *scope.FindVar(Output("Out"));
if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(
"fake init op's output only"
"supports SelectedRows and LoDTensor");
}
}
};
class FakeInitOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
AddComment(R"DOC(
FakeInit Operator.
Init an variable but not alloc memory for it, it is used for init the
table parameter at trainer side in distributed lookup table.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fake_init, ops::FakeInitOp, ops::FakeInitInferShape,
ops::FakeInitOpMaker, paddle::framework::EmptyGradOpMaker,
ops::FakeInitOpVarTypeInference);
...@@ -24,7 +24,7 @@ class FillConstantInferShape : public framework::InferShapeBase { ...@@ -24,7 +24,7 @@ class FillConstantInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null."); "Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape)); ctx->SetOutputDim("Out", framework::make_ddim(shape));
} }
}; };
...@@ -47,10 +47,10 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -47,10 +47,10 @@ class FillConstantOp : public framework::OperatorBase {
if (out_var.IsType<framework::LoDTensor>()) { if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>(); tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape"))); tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) { } else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value(); tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape"))); tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"fill constant op's output only" "fill constant op's output only"
...@@ -83,7 +83,8 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -83,7 +83,8 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
"(int, default 5 (FP32)) " "(int, default 5 (FP32)) "
"Output data type") "Output data type")
.SetDefault(framework::proto::VarType::FP32); .SetDefault(framework::proto::VarType::FP32);
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output"); AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output");
AddAttr<float>("value", "(float, default 0) The value to be filled") AddAttr<float>("value", "(float, default 0) The value to be filled")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>("force_cpu", AddAttr<bool>("force_cpu",
......
...@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ...@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
ops::GatherOpKernel<int>, ops::GatherOpKernel<double>); ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
ops::GatherOpKernel<int64_t>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>, REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
ops::GatherGradientOpKernel<double>,
ops::GatherGradientOpKernel<int>, ops::GatherGradientOpKernel<int>,
ops::GatherGradientOpKernel<double>); ops::GatherGradientOpKernel<int64_t>);
...@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>); REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>); ops::GatherOpCUDAKernel<double>,
ops::GatherOpCUDAKernel<int64_t>,
ops::GatherOpCUDAKernel<int>);
REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
ops::GatherGradOpCUDAKernel<double>,
ops::GatherGradOpCUDAKernel<int64_t>,
ops::GatherGradOpCUDAKernel<int>);
...@@ -52,7 +52,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -52,7 +52,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of GaussianRandomOp should not be null."); "Output(Out) of GaussianRandomOp should not be null.");
auto shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
std::vector<int64_t> temp; std::vector<int64_t> temp;
temp.reserve(shape.size()); temp.reserve(shape.size());
for (auto dim : shape) { for (auto dim : shape) {
...@@ -88,9 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -88,9 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddOutput("Out", "Output matrix of gaussian random op"); AddOutput("Out", "Output matrix of gaussian random op");
AddAttr<std::vector<int>>("shape", AddAttr<std::vector<int64_t>>("shape",
"(vector<int>) " "(vector<int64_t>) "
"The dimension of random tensor."); "The dimension of random tensor.");
AddAttr<float>("mean", AddAttr<float>("mean",
"(float, default 0.0) " "(float, default 0.0) "
"mean of random tensor.") "mean of random tensor.")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/hash_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
class HashOp : public framework::OperatorWithKernel {
public:
HashOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of HashOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of HashOp should not be null.");
auto dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(dims.size(), 2UL,
"The input of hash_op's dimensions must be 2");
std::vector<int64_t> out_dims;
out_dims.reserve(dims.size() + 1);
// copy all dims except the last one
for (size_t i = 0u; i != dims.size() - 1; ++i) {
out_dims.emplace_back(dims[i]);
}
int num_hash = ctx->Attrs().Get<int>("num_hash");
out_dims.emplace_back(num_hash);
// set the last dim to 1
out_dims.emplace_back(1);
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class HashOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input tensor of scale operator.");
AddOutput("Out", "(Tensor) Output tensor of scale operator.");
AddComment(R"DOC(
**Hash Operator**
$$Out = scale * X$$
)DOC");
AddAttr<int>("num_hash", "").SetDefault(1);
AddAttr<int>("mod_by", "").SetDefault(100000);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel<int>, ops::HashKerel<int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
extern "C" {
#include <xxhash.h>
}
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
// template <typename DeviceContext, typename T>
template <typename T>
class HashKerel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
auto* out_t = context.Output<framework::LoDTensor>("Out");
auto* in_t = context.Input<framework::LoDTensor>("X");
int mod_by = context.Attr<int>("mod_by");
int num_hash = context.Attr<int>("num_hash");
auto* output = out_t->mutable_data<T>(context.GetPlace());
auto in_dims = in_t->dims();
auto in_lod = in_t->lod();
PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
"The actual input data's size mismatched with LoD information.");
auto seq_length = in_dims[0];
auto last_dim = in_dims[in_dims.size() - 1];
auto* input = in_t->data<T>();
for (int idx = 0; idx < seq_length; ++idx) {
for (int ihash = 0; ihash != num_hash; ++ihash) {
output[idx * num_hash + ihash] =
XXH64(input, sizeof(int) * last_dim, ihash) % mod_by;
}
input += last_dim;
}
}
};
} // namespace operators
} // namespace paddle
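For reference, a minimal standalone sketch of the per-row hashing the kernel above performs, assuming the xxhash header is available; the concrete sizes and input values are illustrative only:

#include <cstdio>
#include <vector>
extern "C" {
#include <xxhash.h>
}

// Sketch of the loop in HashKerel::Compute above: hash every row of a
// [seq_length, last_dim] int tensor num_hash times (seed = 0 .. num_hash - 1)
// and reduce each hash modulo mod_by.
int main() {
  const int seq_length = 2, last_dim = 3, num_hash = 2, mod_by = 100000;
  std::vector<int> input = {1, 2, 3, 4, 5, 6};  // two rows of three ids
  std::vector<long long> output(seq_length * num_hash);
  const int* row = input.data();
  for (int idx = 0; idx < seq_length; ++idx) {
    for (int ihash = 0; ihash != num_hash; ++ihash) {
      output[idx * num_hash + ihash] =
          XXH64(row, sizeof(int) * last_dim, ihash) % mod_by;
    }
    row += last_dim;
  }
  for (long long v : output) std::printf("%lld\n", v);
  return 0;
}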
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lars_momentum_op.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace paddle {
namespace operators {
class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param",
"(LoDTensor, default LoDTensor<float>) "
"Input parameter that has to be updated");
AddInput("Grad",
"(LoDTensor, default LoDTensor<float>) "
"Input gradient of the parameter");
AddInput("Velocity",
"(LoDTensor, default LoDTensor<float>) "
"Input velocity (corresponding to the parameter) "
"that has to be updated");
AddInput("LearningRate",
"(LoDTensor, default LoDTensor<float>) "
"Input learning rate");
AddOutput("ParamOut",
"(LoDTensor) This output is updated parameter. "
"It shared memory with Input(Param).");
AddOutput("VelocityOut",
"(LoDTensor) This output is updated velocity. "
"It shared memory with Input(Velocity).");
AddAttr<float>("mu", "(float) Momentum coefficient");
AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
.SetDefault(0.001);
AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005);
AddComment(R"DOC(
Lars Momentum Optimizer.
This optimizer uses LARS (https://arxiv.org/abs/1708.03888) to optimize each
weight using a local learning rate:
$$
local\_lr = \eta *
\frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
velocity = mu * velocity +
local\_lr * (grad + \beta * param) \\
param = param - velocity. \\
$$
Note that lars_weight_decay is used here to decay the weights, so you may not need
an L2 regularizer when using LARS.
)DOC");
}
};
class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::LarsMomentumOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
ops::LarsMomentumOpKernel<double>);
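As a worked instance of the local learning-rate formula above, with purely illustrative numbers (learning rate 0.1, lars_coeff 0.001, lars_weight_decay 0.0005, a parameter norm of 10 and a gradient norm of 1):

$$
local\_lr = lr \cdot lars\_coeff \cdot
\frac{\left\|param\right\|}{\left\|grad\right\| + lars\_weight\_decay \cdot \left\|param\right\|}
= 0.1 \cdot 0.001 \cdot \frac{10}{1 + 0.0005 \cdot 10}
= \frac{10^{-3}}{1.005} \approx 9.95 \times 10^{-4}
$$

This matches the computation in the CPU and CUDA kernels below: the velocity is then updated with this local rate and subtracted from the parameter.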
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lars_momentum_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out) {
T lr = learning_rate[0];
T local_lr = learning_rate[0];
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
if (p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]);
}
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new;
p_out[i] = p[i] - v_new;
}
}
template <typename DeviceContext, typename T>
class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
auto param = ctx.Input<framework::LoDTensor>("Param");
auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
auto grad = ctx.Input<framework::LoDTensor>("Grad");
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
auto* p = param->data<T>();
auto* v = velocity->data<T>();
auto* g = grad->data<T>();
auto* lr = learning_rate->data<T>();
int block = 512;
int grid = (param->numel() + block - 1) / block;
auto eigen_p = framework::EigenVector<T>::Flatten(*param);
auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
// calculate the norms using Eigen and launch the kernel.
framework::Tensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
ep_norm.device(*place) = eigen_p.square().sum().sqrt();
eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
lars_momentum,
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class LarsMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
auto param = ctx.Input<framework::LoDTensor>("Param");
auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
auto* grad_var = ctx.InputVar("Grad");
// only support dense for now.
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
auto grad = ctx.Input<framework::LoDTensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
auto p = framework::EigenVector<T>::Flatten(*param);
auto v = framework::EigenVector<T>::Flatten(*velocity);
auto g = framework::EigenVector<T>::Flatten(*grad);
auto* lr = learning_rate->data<T>();
framework::Tensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
p_norm_t.mutable_data<T>(ctx.GetPlace());
g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt();
T local_lr = lr[0];
if (ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0));
}
v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out;
}
};
} // namespace operators
} // namespace paddle
...@@ -27,6 +27,10 @@ limitations under the License. */ ...@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -332,11 +336,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -332,11 +336,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
sync_mode, checkpoint_block_id)); sync_mode, checkpoint_block_id));
rpc_service_->RegisterRPC(distributed::kRequestSend, rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get()); request_send_handler_.get(),
FLAGS_rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestGet, rpc_service_->RegisterRPC(distributed::kRequestGet,
request_get_handler_.get()); request_get_handler_.get(),
FLAGS_rpc_get_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestPrefetch, rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get()); request_prefetch_handler_.get(),
FLAGS_rpc_prefetch_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
request_checkpoint_handler_.get()); request_checkpoint_handler_.get());
......
...@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output " "Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.") "with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding); .SetDefault(kNoPadding);
// NOTE(minqiyang): grad_inplace is a temporary attribute;
// please do NOT set this attribute in the Python layer.
AddAttr<bool>("grad_inplace",
"(boolean, default false) "
"If true, the grad op reuses the input's variable.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Lookup Table Operator. Lookup Table Operator.
...@@ -115,7 +121,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -115,7 +121,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out"));
return framework::OpKernelType(data_type, ctx.device_context()); return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
const auto *table = table_t.value().data<T>(); const auto *table = table_t.value().data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace()); auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != kNoPadding && ids[i] == padding_idx) { if (padding_idx != kNoPadding && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
...@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0);
auto id_index = table_t.Index(ids[i]); auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
memcpy(output + i * row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
row_width * sizeof(T)); output + i * row_width);
} }
} }
} }
...@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel(); int64_t ids_num = ids->numel();
framework::Vector<int64_t> new_rows; std::vector<int64_t> new_rows;
new_rows.reserve(ids_num); new_rows.resize(ids_num);
for (int64_t i = 0; i < ids_num; i++) { std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
new_rows.push_back(ids_data[i]);
}
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->Resize({ids_num, table_dim[1]});
d_table_value->mutable_data<T>(context.GetPlace()); // FIXME(minqiyang):
// memory optimization will NOT reuse Tensor with SelectedRows
d_table->set_height(table_dim[0]); // so we could just share the tensor here directly.
// However, the InferVarType method will infer the output SelectedRows
auto *d_output_data = d_output->data<T>(); // to Tensor sometimes, which is a bug, so we will add an attribute
auto *d_table_data = d_table_value->data<T>(); // here to indicate the inplace and remove this attribute after
// the InferVarType's bug was fixed
auto d_output_dims = d_output->dims(); bool grad_inplace = context.Attr<bool>("grad_inplace");
PADDLE_ENFORCE_EQ( if (grad_inplace) {
d_table_value->dims(), d_table_value->ShareDataWith(*d_output);
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); } else {
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); d_table_value->mutable_data<T>(context.GetPlace());
d_table->set_height(table_dim[0]);
auto *d_output_data = d_output->data<T>();
auto *d_table_data = d_table_value->data<T>();
auto d_output_dims = d_output->dims();
PADDLE_ENFORCE_EQ(
d_table_value->dims(),
framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
}
} else { } else {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
......
...@@ -76,6 +76,6 @@ endif() ...@@ -76,6 +76,6 @@ endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
cc_library(jit_kernel cc_library(jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas) DEPS cpu_info cblas)
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
...@@ -39,6 +39,52 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { ...@@ -39,6 +39,52 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
return -1; return -1;
} }
template <typename T>
HOSTDEVICE inline size_t LowerBound(const T *x, size_t num, const T &val) {
#ifdef __CUDA_ARCH__
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/lower_bound
auto *first = x;
int64_t count = static_cast<int64_t>(num);
while (count > 0) {
int64_t step = (count >> 1);
auto *it = first + step;
if (*it < val) {
first = ++it;
count -= (step + 1);
} else {
count = step;
}
}
return static_cast<size_t>(first - x);
#else
return static_cast<size_t>(std::lower_bound(x, x + num, val) - x);
#endif
}
template <typename T>
HOSTDEVICE inline size_t UpperBound(const T *x, size_t num, const T &val) {
#ifdef __CUDA_ARCH__
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/upper_bound
auto *first = x;
int64_t count = static_cast<int64_t>(num);
while (count > 0) {
auto step = (count >> 1);
auto *it = first + step;
if (val < *it) {
count = step;
} else {
first = ++it;
count -= (step + 1);
}
}
return static_cast<size_t>(first - x);
#else
return static_cast<size_t>(std::upper_bound(x, x + num, val) - x);
#endif
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
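A minimal host-side usage sketch of the semantics implemented by LowerBound/UpperBound above (the CPU branch simply wraps the standard algorithms; the array values are illustrative):

#include <algorithm>
#include <cstdio>

// For a sorted array, LowerBound returns the index of the first element that
// is >= val, and UpperBound the index of the first element that is > val.
int main() {
  const int x[] = {1, 3, 3, 5, 7};
  const int num = 5;
  long lo = std::lower_bound(x, x + num, 3) - x;  // -> 1
  long hi = std::upper_bound(x, x + num, 3) - x;  // -> 3
  std::printf("%ld %ld\n", lo, hi);
  return 0;
}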
...@@ -151,6 +151,13 @@ class GRUKernel : public Kernel { ...@@ -151,6 +151,13 @@ class GRUKernel : public Kernel {
virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
}; };
template <typename T>
class CRFDecodeKernel : public Kernel {
public:
virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
int *track) const = 0;
};
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <limits>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace paddle {
namespace operators {
namespace math {
namespace jitkernel {
namespace jit = platform::jit;
/* CRF Decode JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
public:
explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
this->num_ = tag_num;
}
void Compute(const int seq_len, const T* x, const T* w, T* alpha,
int* track) const override {
constexpr int state_trans_base_idx = 2;
for (int i = 0; i < this->num_; ++i) {
alpha[i] = w[i] + x[i];
}
for (int k = 1; k < seq_len; ++k) {
for (int i = 0; i < this->num_; ++i) {
T max_score = -std::numeric_limits<T>::max();
int max_j = 0;
for (int j = 0; j < this->num_; ++j) {
T score = alpha[(k - 1) * this->num_ + j] +
w[(j + state_trans_base_idx) * this->num_ + i];
if (score > max_score) {
max_score = score;
max_j = j;
}
}
alpha[k * this->num_ + i] = max_score + x[k * this->num_ + i];
track[k * this->num_ + i] = max_j;
}
}
}
};
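In equation form, the reference kernel above implements the Viterbi-style recurrence (row 0 of w holds the start weights, rows from index 2 onward hold the transition weights):

$$
\alpha_{0,i} = w_{0,i} + x_{0,i}, \qquad
\alpha_{k,i} = \max_{j}\left(\alpha_{k-1,j} + w_{j+2,\,i}\right) + x_{k,i}, \qquad
track_{k,i} = \arg\max_{j}\left(\alpha_{k-1,j} + w_{j+2,\,i}\right)
$$

The AVX/AVX2/AVX-512 macros that follow vectorize the inner maximum over j in blocks of 8 or 16 floats.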
#define INIT_ALPHA(step_size) \
/* Setup the alpha initial value.*/ \
int i_offset = 0; \
int last_offset = this->rest_ - step_size; \
for (int i = 0; i <= this->end_; ++i) { \
/* weights, input and alpha values. */ \
__m256 w_content, x_content, alpha_content; \
/* Load the relevant data into the variables from un-aligned address.*/ \
w_content = _mm256_loadu_ps(w + i_offset); \
x_content = _mm256_loadu_ps(x + i_offset); \
alpha_content = _mm256_add_ps(w_content, x_content); \
_mm256_storeu_ps(alpha + i_offset, alpha_content); \
i_offset += step_size; \
if (i == this->end_ - 1) { \
if (this->rest_ > 0) { \
i_offset += last_offset; \
} else { \
break; \
} \
} \
}
#define UPDATE_ALPHA(step_size) \
/* Update the alpha and track values. */ \
__m256 x_content = _mm256_loadu_ps(x + seq_offset + this->num_ + j_offset); \
max_score = _mm256_add_ps(max_score, x_content); \
_mm256_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score); \
_mm256_storeu_si256( \
reinterpret_cast<__m256i*>(track + seq_offset + this->num_ + j_offset), \
max_j); \
/* Calculate the offset of next step*/ \
j_offset += step_size; \
if (j == this->end_ - 1) { \
if (this->rest_ > 0) { \
j_offset += last_offset; \
} else { \
break; \
} \
}
#define INTRIAVX_FLOAT(block) \
template <> \
CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl( \
int tag_num) \
: CRFDecodeKernel<float>() { \
this->num_ = tag_num; \
this->end_ = this->num_ / AVX_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX_FLOAT_BLOCK; \
} \
template <> \
void CRFDecodeKernelImpl<float, jit::avx, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \
INIT_ALPHA(AVX_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \
for (int k = 1; k < seq_len; ++k) { \
int j_offset = 0; \
for (int j = 0; j <= this->end_; ++j) { \
/* Initialize the variables of maximum score and location.*/ \
__m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
__m256i max_j = _mm256_set1_epi32(0); \
/* Calculate the offset of transition_weights.*/ \
int trans_offset = state_trans_base_idx * this->num_ + j_offset; \
for (int i = 0; i < this->num_; ++i) { \
/* Initialize the content of the alpha variable at the related offset.*/ \
__m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); \
/* Obtain the content of weights from un-aligned address.*/ \
__m256 w_content = _mm256_loadu_ps(w + trans_offset); \
__m256 score_v = _mm256_add_ps(alpha_content, w_content); \
__m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); \
/* According to the mask value, update the index of the max_score.*/ \
/* AVX instructions.*/ \
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \
__m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \
__m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \
hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); \
lo_max_j = _mm_or_si128(lo_mask, lo_max_j); \
hi_max_j = _mm_or_si128(hi_mask, hi_max_j); \
max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); \
max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); \
/* AVX done*/ \
/* Update the max_score value.*/ \
max_score = _mm256_max_ps(max_score, score_v); \
trans_offset += this->num_; \
} \
UPDATE_ALPHA(AVX_FLOAT_BLOCK) \
} \
seq_offset += this->num_; \
} \
}
#define INTRIAVX2_FLOAT(isa, block) \
template <> \
CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num) \
: CRFDecodeKernel<float>() { \
this->num_ = tag_num; \
this->end_ = this->num_ / AVX2_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX2_FLOAT_BLOCK; \
} \
template <> \
void CRFDecodeKernelImpl<float, isa, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \
INIT_ALPHA(AVX2_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \
for (int k = 1; k < seq_len; ++k) { \
int j_offset = 0; \
for (int j = 0; j <= this->end_; ++j) { \
/* Initialize the variables of maximum score and location.*/ \
__m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
__m256i max_j = _mm256_set1_epi32(0); \
/* Calculate the offset of transition_weights.*/ \
int trans_offset = state_trans_base_idx * this->num_ + j_offset; \
for (int i = 0; i < this->num_; ++i) { \
/* Initialize the content of the alpha variable at the related offset.*/ \
__m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); \
/* Obtain the content of weights from un-aligned address.*/ \
__m256 w_content = _mm256_loadu_ps(w + trans_offset); \
__m256 score_v = _mm256_add_ps(alpha_content, w_content); \
__m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); \
/* According to the mask value, update the index of the max_score.*/ \
/* AVX2 instructions.*/ \
max_j = _mm256_or_si256( \
_mm256_andnot_si256((__m256i)mask, max_j), \
_mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); \
/* Update the max_score value.*/ \
max_score = _mm256_max_ps(max_score, score_v); \
trans_offset += this->num_; \
} \
UPDATE_ALPHA(AVX2_FLOAT_BLOCK) \
} \
seq_offset += this->num_; \
} \
}
#define INTRIAVX512_FLOAT(block) \
template <> \
CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl( \
int tag_num) \
: CRFDecodeKernel<float>() { \
this->num_ = tag_num; \
this->end_ = this->num_ / AVX512_FLOAT_BLOCK; \
this->rest_ = this->num_ % AVX512_FLOAT_BLOCK; \
} \
template <> \
void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \
INIT_ALPHA(AVX512_FLOAT_BLOCK) \
/* Use the column-major strategy to get the location of maximum score.*/ \
int seq_offset = 0; \
constexpr int state_trans_base_idx = 2; \
for (int k = 1; k < seq_len; ++k) { \
int j_offset = 0; \
for (int j = 0; j <= this->end_; ++j) { \
/* Initialize the variables of maximum score and location.*/ \
__m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max()); \
__m512i max_j = _mm512_setzero_si512(); \
/* Calculate the offset of transition_weights.*/ \
int trans_offset = state_trans_base_idx * this->num_ + j_offset; \
for (int i = 0; i < this->num_; ++i) { \
/* Initialize the content of the alpha variable at the related offset.*/ \
__m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i)); \
/* Obtain the content of weights from un-aligned address.*/ \
__m512 w_content = _mm512_loadu_ps(w + trans_offset); \
__m512 score_v = _mm512_add_ps(alpha_content, w_content); \
__mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); \
/* AVX512 instructions.*/ \
max_j = _mm512_mask_set1_epi32(max_j, mask, i); \
/* Update the max_score value.*/ \
max_score = _mm512_max_ps(max_score, score_v); \
trans_offset += this->num_; \
} \
/* Update the alpha and track values.*/ \
__m512 x_content = \
_mm512_loadu_ps(x + seq_offset + this->num_ + j_offset); \
max_score = _mm512_add_ps(max_score, x_content); \
_mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset, \
max_score); \
_mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset + \
this->num_ + j_offset), \
max_j); \
/* Calculate the offset of next step*/ \
j_offset += AVX512_FLOAT_BLOCK; \
if (j == this->end_ - 1) { \
if (this->rest_ > 0) { \
j_offset += last_offset; \
} else { \
break; \
} \
} \
} \
seq_offset += this->num_; \
} \
}
#ifdef __AVX__
INTRIAVX_FLOAT(kEQ8);
INTRIAVX_FLOAT(kGT8LT16);
INTRIAVX_FLOAT(kEQ16);
INTRIAVX_FLOAT(kGT16);
#endif
#ifdef __AVX2__
INTRIAVX2_FLOAT(jit::avx2, kEQ8);
INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
INTRIAVX2_FLOAT(jit::avx2, kEQ16);
INTRIAVX2_FLOAT(jit::avx2, kGT16);
#endif
#ifdef __AVX512F__
INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
INTRIAVX512_FLOAT(kEQ16);
INTRIAVX512_FLOAT(kGT16);
#endif
#undef INTRIAVX512_FLOAT
#undef INTRIAVX2_FLOAT
#undef INTRIAVX_FLOAT
#undef INIT_ALPHA
#undef UPDATE_ALPHA
REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel);
} // namespace jitkernel
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <map>
#include <set> #include <set>
#include <vector> #include <unordered_map>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
...@@ -230,8 +229,24 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>; ...@@ -230,8 +229,24 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
// add or mul. // add or mul.
namespace scatter { namespace scatter {
size_t FindPos(const std::vector<int64_t>& rows, int64_t value) { template <typename DeviceContext, typename T>
return std::find(rows.begin(), rows.end(), value) - rows.begin(); typename std::enable_if<
std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
size_t data_len, const T* in, T* out) {
blas->AXPY(data_len, 1., in, out);
}
template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
size_t data_len, const T* in, T* out) {
for (int64_t i = 0; i < data_len; i++) {
out[i] += in[i];
}
} }
template <typename T> template <typename T>
...@@ -246,48 +261,84 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -246,48 +261,84 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output; std::vector<const framework::SelectedRows*> inputs;
std::vector<int64_t> input_rows(input.rows()); inputs.push_back(&input);
(*this)(context, inputs, output);
}
std::map<int64_t, std::vector<int64_t>> merge_row_map; void operator()(const platform::CPUDeviceContext& context,
for (size_t i = 0; i < input_rows.size(); ++i) { const std::vector<const framework::SelectedRows*>& inputs,
merge_row_map[input_rows[i]].push_back(i); framework::SelectedRows* output) {
if (inputs.size() == 0) {
VLOG(3) << "no input! return";
return;
} }
const framework::SelectedRows* has_value_input = nullptr;
std::vector<int64_t> merge_rows(merge_row_map.size()); for (auto* in : inputs) {
size_t idx = 0; if (in->rows().size() > 0) {
int64_t input_width = input.value().dims()[1]; has_value_input = in;
out.set_height(input.height()); break;
}
T* out_data = out.mutable_value()->mutable_data<T>( }
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
return;
}
auto input_width = has_value_input->value().dims()[1];
auto input_height = has_value_input->height();
framework::SelectedRows& out = *output;
std::set<int64_t> merged_row_set;
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
std::vector<int64_t> merge_rows(merged_row_set.begin(),
merged_row_set.end());
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
out.set_rows(merge_rows);
out.set_height(input_height);
out.mutable_value()->mutable_data<T>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace()); context.GetPlace());
const T* in_data = input.value().data<T>();
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
for (auto& row_pair : merge_row_map) { constant_functor(context, out.mutable_value(), 0.0);
auto* out_ptr = out_data + idx * input_width;
auto& rows = row_pair.second; auto* out_data = out.mutable_value()->data<T>();
merge_rows[idx] = row_pair.first;
++idx; auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
// rows.size() is always larger than 0 for (auto* input : inputs) {
std::memcpy(out_ptr, in_data + rows[0] * input_width, if (input->rows().size() == 0) {
sizeof(T) * input_width); continue;
}
for (size_t i = 1; i < rows.size(); ++i) { auto* input_data = input->value().data<T>();
auto* in_ptr = in_data + rows[i] * input_width; auto& input_rows = input->rows();
for (int64_t j = 0; j < input_width; ++j) {
out_ptr[j] += in_ptr[j]; for (size_t i = 0; i < input_rows.size(); i++) {
} size_t out_i = rows_to_id[input_rows[i]];
elementwise_add_to<platform::CPUDeviceContext, T>(
context, &blas, static_cast<size_t>(input_width),
&input_data[i * input_width], &out_data[out_i * input_width]);
} }
} }
out.set_rows(merge_rows);
} }
}; };
template struct MergeAdd<platform::CPUDeviceContext, int>; template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>; template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template <typename T> template <typename T>
struct UpdateToTensor<platform::CPUDeviceContext, T> { struct UpdateToTensor<platform::CPUDeviceContext, T> {
......
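Conceptually, the new multi-input MergeAdd collects the union of row ids across all inputs and sums rows that share an id. A minimal standalone sketch with plain containers (illustrative only, not the SelectedRows API):

#include <cstdio>
#include <map>
#include <utility>
#include <vector>

// Every input is a list of (row id, row values) pairs; rows sharing an id are
// summed element-wise into the merged output, mirroring MergeAdd above.
int main() {
  using Rows = std::vector<std::pair<long long, std::vector<float>>>;
  std::vector<Rows> inputs = {{{0, {1, 1}}, {2, {2, 2}}},
                              {{2, {3, 3}}, {5, {4, 4}}}};
  std::map<long long, std::vector<float>> merged;
  for (const auto& in : inputs) {
    for (const auto& row : in) {
      auto& acc = merged[row.first];
      if (acc.empty()) acc.assign(row.second.size(), 0.0f);
      for (size_t j = 0; j < row.second.size(); ++j) acc[j] += row.second[j];
    }
  }
  for (const auto& kv : merged) {
    std::printf("row %lld:", kv.first);  // row 2 accumulates to {5, 5}
    for (float v : kv.second) std::printf(" %g", v);
    std::printf("\n");
  }
  return 0;
}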
...@@ -267,10 +267,15 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -267,10 +267,15 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input, const framework::SelectedRows& input,
framework::SelectedRows* output) { framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
framework::Vector<int64_t> input_rows(input.rows()); framework::Vector<int64_t> input_rows(input.rows());
if (input_rows.size() == 0) {
return;
}
framework::SelectedRows& out = *output;
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
framework::Vector<int64_t> merge_rows(merge_rows_cpu);
auto input_width = input.value().dims()[1]; auto input_width = input.value().dims()[1];
...@@ -296,6 +301,73 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -296,6 +301,73 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
out.mutable_rows()->CUDAMutableData(context.GetPlace()), out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width); out.rows().size(), input_width);
} }
void operator()(const platform::CUDADeviceContext& context,
const std::vector<const framework::SelectedRows*>& inputs,
framework::SelectedRows* output) {
if (inputs.size() == 0) {
VLOG(3) << "no input! return";
return;
}
const framework::SelectedRows* has_value_input = nullptr;
for (auto* in : inputs) {
if (in->rows().size() > 0) {
has_value_input = in;
break;
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
return;
}
auto input_width = has_value_input->value().dims()[1];
auto input_height = has_value_input->height();
framework::SelectedRows& out = *output;
std::set<int64_t> merged_row_set;
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same "
"dimension except for the first one");
PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height");
merged_row_set.insert(input->rows().begin(), input->rows().end());
}
std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
merged_row_set.end());
framework::Vector<int64_t> merge_rows(merge_rows_cpu);
out.set_rows(merge_rows);
out.set_height(input_height);
out.mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
}
auto* input_data = input->value().data<T>();
auto& input_rows = input->rows();
dim3 grid1(input_rows.size(), 1);
MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
input_data, input_rows.CUDAData(context.GetPlace()), out_data,
out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width);
}
}
};

template struct MergeAdd<platform::CUDADeviceContext, float>;
......
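A note on the launch shape used in the CUDA path above: every non-empty input gets its own launch with one block per input row and a fixed 256-thread block. Assuming MergeAddKernel's job is simply "find this input row's id in the merged row list and add the row into the matching output row" (an assumption here; the kernel body is not part of this hunk), a rough CPU analogue of one launch looks like this sketch:

#include <cstddef>

// Rough CPU analogue of one MergeAddKernel launch (illustrative assumption,
// not the real kernel): for every input row, locate its id in out_rows and
// add the input_width values into that output row.
void merge_add_launch_cpu(const float* in, const long* in_rows,
                          std::size_t in_rows_n, float* out,
                          const long* out_rows, std::size_t out_rows_n,
                          long width) {
  for (std::size_t b = 0; b < in_rows_n; ++b) {  // one "block" per input row
    std::size_t o = 0;
    while (o < out_rows_n && out_rows[o] != in_rows[b]) ++o;  // find output row
    for (long t = 0; t < width; ++t) {  // "threads" striding over the row
      out[o * width + t] += in[b * width + t];
    }
  }
}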
...@@ -83,104 +83,9 @@ struct MergeAdd {
  void operator()(const DeviceContext& context,
                  const framework::SelectedRows& input,
                  framework::SelectedRows* output);
  void operator()(const DeviceContext& context,
                  const std::vector<const framework::SelectedRows*>& inputs,
                  framework::SelectedRows* output);
};

template <>
struct MergeAdd<platform::CPUDeviceContext, float> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
std::vector<int64_t> input_rows(input.rows());
std::map<int64_t, std::vector<int64_t>> merge_row_map;
for (size_t i = 0; i < input_rows.size(); ++i) {
merge_row_map[input_rows[i]].push_back(i);
}
std::vector<int64_t> merge_rows(merge_row_map.size());
size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height());
auto* out_data = out.mutable_value()->mutable_data<float>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
auto* in_data = input.value().data<float>();
auto blas = GetBlas<platform::CPUDeviceContext, float>(context);
for (auto& row_pair : merge_row_map) {
auto* out_ptr = out_data + idx * input_width;
auto& rows = row_pair.second;
merge_rows[idx] = row_pair.first;
++idx;
// rows.size() is always larger than 0
blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
for (size_t i = 1; i < rows.size(); ++i) {
blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
}
}
out.set_rows(merge_rows);
}
};
template <>
struct MergeAdd<platform::CPUDeviceContext, double> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
std::vector<int64_t> input_rows(input.rows());
std::map<int64_t, std::vector<int64_t>> merge_row_map;
for (size_t i = 0; i < input_rows.size(); ++i) {
merge_row_map[input_rows[i]].push_back(i);
}
std::vector<int64_t> merge_rows(merge_row_map.size());
size_t idx = 0;
int64_t input_width = input.value().dims()[1];
out.set_height(input.height());
auto* out_data = out.mutable_value()->mutable_data<double>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
auto* in_data = input.value().data<double>();
auto blas = GetBlas<platform::CPUDeviceContext, double>(context);
for (auto& row_pair : merge_row_map) {
auto* out_ptr = out_data + idx * input_width;
auto& rows = row_pair.second;
merge_rows[idx] = row_pair.first;
++idx;
// rows.size() is always larger than 0
blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
for (size_t i = 1; i < rows.size(); ++i) {
blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
}
}
out.set_rows(merge_rows);
}
};

template <typename DeviceContext, typename T>
......
...@@ -302,6 +302,64 @@ TEST(selected_rows_functor, cpu_merge_add_int) {
  EXPECT_EQ(out_data[1 * row_numel], 2);
  EXPECT_EQ(out_data[2 * row_numel], 1);
}
TEST(selected_rows_functor, cpu_merge_add_multi) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
float>
set_const;
int64_t height = 10;
int64_t row_numel = 8;
std::vector<int64_t> rows1{5, 2, 5, 3, 5};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
cpu_place);
set_const(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{2, 5, 3, 5, 3};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
cpu_place);
set_const(ctx, in2_value, 1.0);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
output->set_height(height);
paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
float>
merge_add_functor;
std::vector<const paddle::framework::SelectedRows*> inputs;
inputs.push_back(selected_rows1.get());
inputs.push_back(selected_rows2.get());
merge_add_functor(ctx, inputs, output.get());
EXPECT_EQ(output->height(), height);
EXPECT_EQ(output->value().dims(),
paddle::framework::make_ddim({3, row_numel}));
std::vector<int64_t> ret_rows{2, 3, 5};
EXPECT_EQ(output->rows(), ret_rows);
auto* out_data = output->value().data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
}
}
}
TEST(selected_rows_functor, cpu_sum_to) {
  paddle::platform::CPUPlace cpu_place;
  paddle::platform::CPUDeviceContext ctx(cpu_place);
...@@ -318,6 +376,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows1.size()), row_numel}),
      cpu_place);

  functor(ctx, in1_value, 1.0);
  std::vector<int64_t> rows2{0, 5, 7, 9};
  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
...@@ -327,6 +386,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
      paddle::framework::make_ddim(
          {static_cast<int64_t>(rows2.size()), row_numel}),
      cpu_place);

  functor(ctx, in2_value, 2.0);
  std::unique_ptr<paddle::framework::SelectedRows> output{
      new paddle::framework::SelectedRows()};
......
...@@ -241,3 +241,67 @@ TEST(selected_rows_functor, gpu_add_to) {
  // row9: 2.0 + 3.0
  EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0);
}
TEST(selected_rows_functor, gpu_merge_add) {
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float>
set_const;
int64_t height = 10;
int64_t row_numel = 8;
std::vector<int64_t> rows1{5, 2, 5, 3, 5};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place);
set_const(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{2, 5, 3, 5, 3};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
gpu_place);
set_const(ctx, in2_value, 1.0);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
output->set_height(height);
paddle::operators::math::scatter::MergeAdd<
paddle::platform::CUDADeviceContext, float>
merge_add_functor;
std::vector<const paddle::framework::SelectedRows*> inputs;
inputs.push_back(selected_rows1.get());
inputs.push_back(selected_rows2.get());
merge_add_functor(ctx, inputs, output.get());
paddle::framework::Tensor output_cpu;
paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu);
ctx.Wait();
EXPECT_EQ(output->height(), height);
EXPECT_EQ(output->value().dims(),
paddle::framework::make_ddim({3, row_numel}));
std::vector<int64_t> ret_rows{2, 3, 5};
EXPECT_EQ(output->rows(), ret_rows);
auto* out_data = output_cpu.data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) {
for (size_t j = 0; j < row_numel; ++j) {
EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
}
}
}
...@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

template <typename T, bool is_test>
class MaxSeqPoolFunctor {
 public:
  void operator()(const platform::CPUDeviceContext& context,
...@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
    }
  }
};
// Instantiation of Max Sequence Pooling for the test phase, e.g. no need to
// fill the index buffer
template <typename T>
class MaxSeqPoolFunctor<T, true> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
dim * sizeof(T));
for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
for (int64_t k = 0; k < dim; ++k) {
if (in_data[j * dim + k] > out_data[i * dim + k]) {
out_data[i * dim + k] = in_data[j * dim + k];
}
}
}
}
}
};
template <typename T>
class MaxSeqPoolGradFunctor {
 public:
...@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  /* max pool has index output */
  void operator()(const platform::CPUDeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    if (pooltype == "MAX") {
      if (is_test) {
        math::MaxSeqPoolFunctor<T, true> max_pool;
        max_pool(context, input, output, index);
      } else {
        math::MaxSeqPoolFunctor<T, false> max_pool;
        max_pool(context, input, output, index);
      }
      return;
    }
    if (pooltype == "LAST") {
...@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      last_pool(context, input, output);
      return;
    }

    if (pooltype == "FIRST") {
      math::FirstSeqPoolFunctor<T> first_pool;
      first_pool(context, input, output);
......
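The test-phase specialization added in sequence_pooling.cc boils down to a running elementwise max over each sequence, seeded with the sequence's first row and with no bookkeeping of argmax positions. A freestanding sketch of the same loop over plain arrays (illustrative only; the lod offsets and widths below are made-up values):

#include <algorithm>
#include <cstdio>
#include <vector>

// Max-pool each sequence of `in` (rows of width `dim`) given lod offsets
// `starts`, writing one pooled row per sequence and no index buffer.
void max_seq_pool_test_phase(const std::vector<float>& in,
                             const std::vector<size_t>& starts, size_t dim,
                             std::vector<float>* out) {
  size_t num_seq = starts.size() - 1;
  out->assign(num_seq * dim, 0.f);
  for (size_t i = 0; i < num_seq; ++i) {
    std::copy(in.begin() + starts[i] * dim, in.begin() + (starts[i] + 1) * dim,
              out->begin() + i * dim);  // seed with the sequence's first row
    for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
      for (size_t k = 0; k < dim; ++k) {
        (*out)[i * dim + k] = std::max((*out)[i * dim + k], in[j * dim + k]);
      }
    }
  }
}

int main() {
  // lod {0, 2, 5}: two sequences of length 2 and 3, row width 2.
  std::vector<float> in = {1, 8, 3, 2, 5, 0, 4, 9, 7, 6};
  std::vector<size_t> starts = {0, 2, 5};
  std::vector<float> out;
  max_seq_pool_test_phase(in, starts, 2, &out);
  for (float v : out) std::printf("%.0f ", v);  // prints: 3 8 7 9
  return 0;
}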
...@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const std::string pooltype, const framework::LoDTensor& input,
                  framework::Tensor* output, bool is_test,
                  framework::Tensor* index = nullptr) {
    auto& lod = input.lod()[0];
    const size_t item_dim = output->numel() / output->dims()[0];
......
...@@ -28,7 +28,7 @@ class SequencePoolFunctor {
  /* max pool has index output */
  void operator()(const DeviceContext& context, const std::string pooltype,
                  const framework::LoDTensor& input, framework::Tensor* output,
                  bool is_test = false, framework::Tensor* index = nullptr);
};

template <typename DeviceContext, typename T>
......
...@@ -20,13 +20,16 @@ namespace operators {
class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
        .AsDuplicable();
    AddInput("Rows", "(LoDTensor) the input ids with shape{row_size, 1}, ")
        .AsDuplicable();
    AddInput("X",
             "(LoDTensors) multi input tensor with shape{Rows, N}, N is the "
             "size of embedding table")
        .AsDuplicable();
    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.")
        .AsDuplicable();

    AddComment(R"DOC(
Merge multi LoDTensor's into one according to Ids's shard num.
...@@ -79,15 +82,19 @@ class MergeIdsOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
                   "MergeIdsOp must has multi input Ids.");
    PADDLE_ENFORCE(ctx->HasInputs("Rows"),
                   "MergeIdsOp must has multi input Rows.");
    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has multi input X.");
    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                   "MergeIdsOp must has multi output Out.");

    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
    auto ids_dims = ctx->GetInputsDim("Ids");
    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
      PADDLE_ENFORCE_EQ(ids_dims[0][1], 1);
    }
    auto x_var_type = ctx->GetInputsVarType("X");
    for (auto &var_type : x_var_type) {
......
...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once

#include <tuple>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
...@@ -30,59 +32,70 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
    if (!platform::is_cpu_place(place)) {
      PADDLE_THROW("MergeIds do not support GPU kernel");
    }
    VLOG(3) << "run in MergeIdsOpKernel";

    const auto ids = ctx.MultiInput<framework::LoDTensor>("Ids");
    const auto row_ids = ctx.MultiInput<framework::LoDTensor>("Rows");
    const auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");

    PADDLE_ENFORCE_EQ(row_ids.size(), x_tensors.size(),
                      "the number of Rows and X should be the same");
    PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
                      "the number of Ids and Out should be the same");

    int row_ids_size = 0;
    int row_size = 0;
    int embedding_size = 0;

    for (int i = 0; i < x_tensors.size(); ++i) {
      const auto *x_tensor = x_tensors[i];
      const auto *row_id = row_ids[i];

      if (embedding_size == 0) {
        embedding_size = x_tensor->dims()[1];
      }
      PADDLE_ENFORCE_EQ(embedding_size, x_tensor->dims()[1],
                        "embedding size of all input should be the same");
      row_size += x_tensor->dims()[0];
      row_ids_size += row_id->dims()[0];
    }

    PADDLE_ENFORCE_EQ(
        row_size, row_ids_size,
        "the merged X dim[0] and merged Rows dim[0] should be the same");

    std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
        selected_rows_idx_map;
    for (int i = 0; i < x_tensors.size(); ++i) {
      const auto *row_id = row_ids[i];

      for (int j = 0; j < row_id->numel(); ++j) {
        int64_t key = row_id->data<int64_t>()[j];
        std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
        selected_rows_idx_map.insert(std::make_pair(key, val));
      }
    }
    PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
                      "the rows and tensor map size should be the same");

    for (int i = 0; i < outs.size(); ++i) {
      auto *out_ids = ids[i];
      auto *out = outs[i];

      out->set_lod(out_ids->lod());

      int nums = static_cast<int>(out_ids->dims()[0]);
      auto *out_data = out->mutable_data<T>(
          framework::make_ddim({nums, embedding_size}), place);
      for (int j = 0; j < nums; ++j) {
        int id = out_ids->data<int64_t>()[j];
        auto row_tuple = selected_rows_idx_map[id];
        int64_t row_idx = std::get<1>(row_tuple);
        const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];

        memcpy(out_data + embedding_size * j,
               x_tensor->data<T>() + row_idx * embedding_size,
               sizeof(T) * embedding_size);
      }
    }
  }
......
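The reworked MergeIds kernel above is essentially a two-step lookup: first build a map from each id listed in the `Rows` inputs to (which shard tensor, which row inside it), then gather rows for every id requested by `Ids`. A plain-C++ sketch of that lookup, with made-up shard contents and ids (illustrative only, no Paddle types):

#include <cstdio>
#include <tuple>
#include <unordered_map>
#include <vector>

int main() {
  const int width = 2;
  // Per-shard row ids and the embedding rows that correspond to them.
  std::vector<std::vector<long>> rows = {{0, 3}, {1, 4}, {2, 5}};
  std::vector<std::vector<float>> x = {
      {0, 0, 30, 30}, {10, 10, 40, 40}, {20, 20, 50, 50}};

  // id -> (shard index, row offset inside that shard)
  std::unordered_map<long, std::tuple<int, int>> idx;
  for (int i = 0; i < static_cast<int>(rows.size()); ++i)
    for (int j = 0; j < static_cast<int>(rows[i].size()); ++j)
      idx[rows[i][j]] = std::make_tuple(i, j);

  // Gather the rows requested by one Ids tensor, in its original order.
  std::vector<long> ids = {3, 1, 5, 0};
  for (long id : ids) {
    int shard = std::get<0>(idx[id]);
    int off = std::get<1>(idx[id]);
    std::printf("id %ld -> [%.0f %.0f]\n", id, x[shard][off * width],
                x[shard][off * width + 1]);
  }
  return 0;
}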
...@@ -19,54 +19,6 @@ namespace operators {
using Tensor = framework::Tensor;
class MomentumOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(grad) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Velocity"),
"Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
"Output(VelocityOut) of Momentum should not be null.");
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
}
PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
"Learning_rate should be a scalar");
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class MomentumOpInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
......
...@@ -28,6 +28,54 @@ using framework::SelectedRows;
struct NoNesterov;
struct UseNesterov;
class MomentumOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(grad) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Velocity"),
"Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
"Output(VelocityOut) of Momentum should not be null.");
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
}
PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
"Learning_rate should be a scalar");
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
template <typename T>
class CPUDenseMomentumFunctor {
 private:
......
...@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor<int>) This tensor is used for the sequence max-pooling "
              "to record the max indexes.")
        .AsIntermediate();
    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<std::string>(
        "pooltype",
        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
......
...@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    auto* in = context.Input<LoDTensor>("X");
    auto* out = context.Output<Tensor>("Out");
    std::string pooltype = context.Attr<std::string>("pooltype");
Tensor* index = nullptr;
if (pooltype == "MAX") {
index = context.Output<Tensor>("MaxIndex");
}
    auto dims = in->dims();
    auto lod = in->lod();
...@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    dims[0] = lod[0].size() - 1;
    out->Resize({dims});
    out->mutable_data<T>(context.GetPlace());
    Tensor* index = nullptr;

    const bool is_test = context.Attr<bool>("is_test");

    // Do not create index buffer for inference (is_test) mode
    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
    if (pooltype == "MAX" &&
        (is_test == false ||
         platform::is_cpu_place(context.GetPlace()) == false)) {
      index = context.Output<Tensor>("MaxIndex");
      index->Resize({dims});
      index->mutable_data<int>(context.GetPlace());
    }
    math::SequencePoolFunctor<DeviceContext, T> pool;
    pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
         is_test, index);
  }
};
......
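The double-negative condition in the kernel above is easiest to read as a small truth table: the MaxIndex buffer is skipped only for MAX pooling when is_test is set and the kernel runs on a CPU place; in every other case it is still allocated. A standalone sketch of the same predicate (illustrative only):

#include <cstdio>

// Mirrors the condition in SequencePoolKernel: allocate the MaxIndex buffer
// unless we are in test mode on a CPU place.
bool needs_max_index(bool is_max_pool, bool is_test, bool is_cpu) {
  return is_max_pool && (is_test == false || is_cpu == false);
}

int main() {
  for (int is_test = 0; is_test <= 1; ++is_test)
    for (int is_cpu = 0; is_cpu <= 1; ++is_cpu)
      std::printf("MAX pool, is_test=%d, cpu=%d -> index buffer: %s\n", is_test,
                  is_cpu, needs_max_index(true, is_test, is_cpu) ? "yes" : "no");
  return 0;
}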
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h"
namespace ops = paddle::operators;
REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseOp,
ops::SequenceReverseOpMaker,
ops::SequenceReverseGradOpDescMaker);
REGISTER_OP_CPU_KERNEL(
sequence_reverse,
ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, double>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
sequence_reverse,
ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
class SequenceReverseOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE_GE(x_dim.size(), 2,
"Rank of Input(X) must be not less than 2.");
ctx->SetOutputDim("Y", x_dim);
ctx->ShareLoD("X", "Y");
}
};
class SequenceReverseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input LoDTensor of sequence_reverse op.");
AddOutput("Y", "The output LoDTensor of sequence_reverse op.");
AddComment(R"DOC(
SequenceReverse Operator.
Reverse each sequence in input X along dim 0.
Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where:
X.data() = [
[1, 2, 3, 4],
[5, 6, 7, 8], # the 0-th sequence with length 2
[9, 10, 11, 12],
[13, 14, 15, 16],
[17, 18, 19, 20] # the 1-st sequence with length 3
]
The output Y would be a LoDTensor sharing the same dims and lod with input X,
and:
Y.data() = [
[5, 6, 7, 8],
[1, 2, 3, 4], # the reversed 0-th sequence with length 2
[17, 18, 19, 20],
[13, 14, 15, 16],
[9, 10, 11, 12] # the reversed 1-st sequence with length 3
]
This Operator is useful to build a reverse dynamic RNN network.
This Operator only supports one-level lod currently.
)DOC");
}
};
template <typename T>
struct SequenceReverseFunctor {
SequenceReverseFunctor(const T *x, T *y, const size_t *lod, size_t lod_count,
size_t row_numel)
: x_(x), y_(y), lod_(lod), lod_count_(lod_count), row_numel_(row_numel) {}
HOSTDEVICE void operator()(size_t idx_x) const {
auto row_idx_x = idx_x / row_numel_;
auto lod_idx = math::UpperBound(lod_, lod_count_, row_idx_x);
auto row_idx_y = lod_[lod_idx - 1] + (lod_[lod_idx] - 1 - row_idx_x);
auto idx_y = row_idx_y * row_numel_ + idx_x % row_numel_;
y_[idx_y] = x_[idx_x];
}
const T *x_;
T *y_;
const size_t *lod_;
size_t lod_count_;
size_t row_numel_;
};
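To make the index arithmetic in the functor above concrete: for each flattened element, the row inside its sequence is mirrored around that sequence's lod boundaries. A small standalone check of the row mapping for lod {0, 2, 5} (illustrative only; math::UpperBound is replaced by std::upper_bound here):

#include <algorithm>
#include <cstdio>
#include <vector>

// Mirror row x within its sequence, as SequenceReverseFunctor does per element.
size_t reversed_row(const std::vector<size_t>& lod, size_t x) {
  size_t lod_idx = std::upper_bound(lod.begin(), lod.end(), x) - lod.begin();
  return lod[lod_idx - 1] + (lod[lod_idx] - 1 - x);
}

int main() {
  std::vector<size_t> lod = {0, 2, 5};  // sequences [0, 2) and [2, 5)
  for (size_t x = 0; x < 5; ++x)
    std::printf("%zu -> %zu\n", x, reversed_row(lod, x));
  // prints: 0->1, 1->0, 2->4, 3->3, 4->2
  return 0;
}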
template <typename DeviceContext, typename T>
class SequenceReverseOpKernel : public framework::OpKernel<T> {
using LoDTensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &x = *ctx.Input<LoDTensor>("X");
auto *y = ctx.Output<LoDTensor>("Y");
PADDLE_ENFORCE_EQ(x.lod().size(), 1,
"SequenceReverse Op only support one level lod.");
auto &dev_ctx = ctx.template device_context<DeviceContext>();
const size_t *lod;
size_t lod_count = x.lod()[0].size();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
lod = x.lod()[0].CUDAData(ctx.GetPlace());
} else {
#endif
lod = x.lod()[0].data();
#ifdef PADDLE_WITH_CUDA
}
#endif
size_t limit = static_cast<size_t>(x.numel());
size_t row_numel = static_cast<size_t>(limit / x.dims()[0]);
auto *x_data = x.data<T>();
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_NE(x_data, y_data,
"SequenceReverse Op does not support in-place operation");
SequenceReverseFunctor<T> functor(x_data, y_data, lod, lod_count,
row_numel);
platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
for_range(functor);
}
};
class SequenceReverseGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("sequence_reverse");
op->SetInput("X", OutputGrad("Y"));
op->SetOutput("Y", InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
...@@ -76,6 +76,8 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxCUDNNKernel<float>,
                   ops::SoftmaxCUDNNKernel<double>,
                   ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxGradCUDNNKernel<float>,
                   ops::SoftmaxGradCUDNNKernel<double>);
...@@ -20,20 +20,27 @@ namespace operators {
class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
        .AsDuplicable();
    AddOutput("Out", "(LoDTensors) The outputs of the input Ids.")
        .AsDuplicable();

    AddComment(R"DOC(
Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
Example:
  Input:
    X = [[1,2,3,4,5,6],[2,3]]

  Out(3 output):
    if compress is True:
      out0 = [3, 3, 6]
      out1 = [1, 4]
      out2 = [2, 2, 5]
    else:
      out0 = [3, 6]
      out1 = [1, 4]
      out2 = [2, 5]
)DOC");
  }
};
...@@ -43,16 +50,24 @@ class SplitIdsOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInputs("Ids"), "SplitIdsOp must has input Ids.");
    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");

    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
    auto ids_dims = ctx->GetInputsDim("Ids");
    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
    }
  }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.MultiInput<framework::Tensor>("Ids").front()->type()),
ctx.GetPlace());
}
};

class SplitIdsOpInferVarType : public framework::VarTypeInference {
...@@ -66,12 +81,28 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
  }
};
class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto grad = new framework::OpDesc();
grad->SetType("concat");
grad->SetInput("X", OutputGrad("Out"));
grad->SetOutput("Out", InputGrad("Ids"));
grad->SetAttr("axis", 0);
return std::unique_ptr<framework::OpDesc>(grad);
}
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                  ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once

#include <iterator>
#include <set>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
...@@ -31,19 +33,39 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
      PADDLE_THROW("SplitIds do not support GPU kernel");
    }

    const auto ids_vars = ctx.MultiInputVar("Ids");

    PADDLE_ENFORCE_GT(ids_vars.size(), 0, "The number of Ids should > 0");
    auto *ids_var = ids_vars[0];

    if (ids_var->IsType<framework::LoDTensor>()) {
      int batch_size = 0;
      const auto ids_tensors = ctx.MultiInput<framework::LoDTensor>("Ids");
      for (size_t i = 0; i < ids_tensors.size(); ++i) {
        batch_size += ids_tensors[i]->dims()[0];
      }
      VLOG(4) << "Get Total BatchSize is: " << batch_size;

      std::vector<T> all_ids(batch_size);
      int offset = 0;
      for (size_t i = 0; i < ids_tensors.size(); ++i) {
        const auto *ids = ids_tensors[i];
        std::memcpy(all_ids.data() + offset, ids->data<T>(),
                    ids->numel() * sizeof(T));
        offset += ids->numel();
      }

      std::set<T> st(all_ids.begin(), all_ids.end());
      all_ids.assign(st.begin(), st.end());

      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
      const size_t shard_num = outs.size();
      std::vector<std::vector<T>> out_ids;
      out_ids.resize(outs.size());

      // split id by their shard_num.
      for (int i = 0; i < all_ids.size(); ++i) {
        T id = all_ids[i];
        size_t shard_id = static_cast<size_t>(id) % shard_num;
        out_ids[shard_id].push_back(id);
      }
...@@ -64,7 +86,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(ids_dims[0],
                        static_cast<int64_t>(ids_selected_rows->rows().size()),
                        "");
      const T *ids_data = ids_selected_rows->value().data<T>();
      const auto &ids_rows = ids_selected_rows->rows();
      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
      const size_t shard_num = outs.size();
...@@ -87,7 +109,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
        for (int64_t i = 0; i < ddim[0]; ++i) {
          memcpy(output + i * row_width,
                 ids_data + id_to_index[out->rows()[i]] * row_width,
                 row_width * sizeof(T));
        }
      }
......
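The sharding rule implemented above (and shown in the SplitIds doc example) is simply id % shard_num, applied after the ids from all inputs are concatenated and de-duplicated in the LoDTensor branch. A compact sketch reproducing the non-compressed case of the example (illustrative only, plain std containers):

#include <cstdio>
#include <set>
#include <vector>

int main() {
  const size_t shard_num = 3;
  std::vector<std::vector<long>> ids_inputs = {{1, 2, 3, 4, 5, 6}, {2, 3}};

  // Concatenate then de-duplicate, as the LoDTensor branch of the kernel does.
  std::set<long> unique_ids;
  for (const auto& in : ids_inputs) unique_ids.insert(in.begin(), in.end());

  std::vector<std::vector<long>> out(shard_num);
  for (long id : unique_ids) out[id % shard_num].push_back(id);

  for (size_t s = 0; s < shard_num; ++s) {
    std::printf("out%zu:", s);
    for (long id : out[s]) std::printf(" %ld", id);
    std::printf("\n");  // out0: 3 6 | out1: 1 4 | out2: 2 5
  }
  return 0;
}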
...@@ -22,9 +22,9 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "The input SelectedRows.");
    AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
    AddAttr<std::vector<int64_t>>("height_sections",
                                  "Height for each output SelectedRows.")
        .SetDefault(std::vector<int64_t>({}));
    AddComment(R"DOC(
Split a SelectedRows with a specified rows section.
......
...@@ -21,7 +21,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

static int FindOutIdx(int row, const std::vector<int64_t>& abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (row < abs_sections[i]) {
      return i - 1;
...@@ -30,9 +30,9 @@ static int FindOutIdx(int row, const std::vector<int64_t>& abs_sections) {
  return abs_sections.size() - 1;
}

static std::vector<int64_t> ToAbsoluteSection(
    const std::vector<int64_t>& height_sections) {
  std::vector<int64_t> abs_sections;
  abs_sections.resize(height_sections.size());
  abs_sections[0] = 0;
  for (size_t i = 1; i < height_sections.size(); ++i) {
...@@ -47,7 +47,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::SelectedRows>("X");
    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
    auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
    auto abs_sections = ToAbsoluteSection(height_sections);
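For the two helpers above: height_sections are cumulative-summed into absolute row offsets, and FindOutIdx then returns the index of the last section whose start is not greater than the row. A short standalone check, with made-up section heights (illustrative only):

#include <cstdint>
#include <cstdio>
#include <vector>

static int find_out_idx(int row, const std::vector<int64_t>& abs) {
  for (size_t i = 1; i < abs.size(); ++i)
    if (row < abs[i]) return static_cast<int>(i) - 1;
  return static_cast<int>(abs.size()) - 1;
}

int main() {
  std::vector<int64_t> height_sections = {3, 4, 3};
  std::vector<int64_t> abs(height_sections.size(), 0);
  for (size_t i = 1; i < abs.size(); ++i)
    abs[i] = abs[i - 1] + height_sections[i - 1];  // {0, 3, 7}
  for (int row : {0, 2, 3, 6, 7, 9})
    std::printf("row %d -> output %d\n", row, find_out_idx(row, abs));
  // rows 0-2 -> output 0, rows 3-6 -> output 1, rows 7-9 -> output 2
  return 0;
}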
......
...@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto x_vars = ctx.MultiInputVar("X");
    auto x_vars_name = ctx.Inputs("X");

    framework::LibraryType library{framework::LibraryType::kPlain};
    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
...@@ -81,15 +82,18 @@ class SumOp : public framework::OperatorWithKernel {
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (size_t idx = 0; idx < x_vars.size(); ++idx) {
        PADDLE_ENFORCE(x_vars[idx] != nullptr,
                       "Input var[%s] should not be nullptr", x_vars_name[idx]);
        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
        auto tensor = framework::GetTensorFromVar(*x_vars[idx]);
        if (tensor->numel() == 0) {
          continue;
        }
        if (dtype == -1) {
          dtype = framework::ToDataType(tensor->type());
        } else {
          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
        }
      }
      PADDLE_ENFORCE_NE(dtype, -1,
......
...@@ -83,79 +83,54 @@ class SumKernel : public framework::OpKernel<T> {
        }
      }
    } else if (out_var->IsType<framework::SelectedRows>()) {
      if (in_place && in_vars.size() < 2) {
        return;
      }

      std::vector<const paddle::framework::SelectedRows *> inputs;
      SelectedRows temp_in0;

      if (in_place) {
        auto &in0 = in_vars[0]->Get<SelectedRows>();
        temp_in0.set_height(in0.height());
        temp_in0.set_rows(in0.rows());
        framework::TensorCopy(in0.value(), in0.place(),
                              context.device_context(),
                              temp_in0.mutable_value());
        inputs.push_back(&temp_in0);
        for (size_t i = 1; i < in_vars.size(); ++i) {
          auto &in = in_vars[i]->Get<SelectedRows>();
          if (in.rows().size() > 0) {
            inputs.push_back(&in);
          }
        }
      } else {
        for (auto &in_var : in_vars) {
          auto &in = in_var->Get<SelectedRows>();
          if (in.rows().size() > 0) {
            inputs.push_back(&in_var->Get<SelectedRows>());
          }
        }
      }

      auto *out = context.Output<SelectedRows>("Out");
      out->mutable_rows()->clear();

      bool has_data = false;
      for (auto &in : inputs) {
        if (in->rows().size() > 0) {
          has_data = true;
          break;
        }
      }
      if (has_data) {
        math::scatter::MergeAdd<DeviceContext, T> merge_add;
        merge_add(context.template device_context<DeviceContext>(), inputs,
                  out);
      } else {
        // no data, just set a empty out tensor.
        out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
                                              context.GetPlace());
      }
    } else if (out_var->IsType<framework::LoDTensorArray>()) {
      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
......
...@@ -210,18 +210,21 @@ REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);

REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);

REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
                  ops::Transpose2GradMaker);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);

REGISTER_OP_CPU_KERNEL(
    transpose2, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    transpose2_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -16,15 +16,18 @@ limitations under the License. */
namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);

REGISTER_OP_CUDA_KERNEL(
    transpose2,
    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    transpose2_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -29,7 +29,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape"); auto shape = ctx.Attr<std::vector<int64_t>>("shape");
auto *selected_rows = out_var->GetMutable<framework::SelectedRows>(); auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
tensor = selected_rows->mutable_value(); tensor = selected_rows->mutable_value();
tensor->Resize(framework::make_ddim(shape)); tensor->Resize(framework::make_ddim(shape));
...@@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"), ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
"uniform_random's min must less then max"); "uniform_random's min must less then max");
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
std::vector<int64_t> temp; std::vector<int64_t> temp;
temp.reserve(shape.size()); temp.reserve(shape.size());
for (auto dim : shape) { for (auto dim : shape) {
...@@ -94,7 +94,7 @@ This operator initializes a tensor with random values sampled from a ...@@ -94,7 +94,7 @@ This operator initializes a tensor with random values sampled from a
uniform distribution. The random result is in set [min, max]. uniform distribution. The random result is in set [min, max].
)DOC"); )DOC");
AddAttr<std::vector<int>>("shape", "The shape of the output tensor"); AddAttr<std::vector<int64_t>>("shape", "The shape of the output tensor");
AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].") AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
.SetDefault(-1.0f); .SetDefault(-1.0f);
AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].") AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
......
...@@ -48,7 +48,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -48,7 +48,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = context.Attr<std::vector<int>>("shape"); auto shape = context.Attr<std::vector<int64_t>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value(); tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(shape)); tensor->Resize(framework::make_ddim(shape));
} else { } else {
......
...@@ -341,6 +341,28 @@ class ScopedPoolingDescriptor { ...@@ -341,6 +341,28 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
}; };
class ScopedSpatialTransformerDescriptor {
public:
ScopedSpatialTransformerDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
}
~ScopedSpatialTransformerDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
}
template <typename T>
inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
const int dimA[]) {
PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
return desc_;
}
private:
cudnnSpatialTransformerDescriptor_t desc_;
DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
};
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
......
...@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { ...@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
"'Place' is not supported, Please re-compile with WITH_GPU " "'Place' is not supported, Please re-compile with WITH_GPU "
"option"); "option");
} }
return it->second.get(); return it->second.get().get();
} }
const std::vector<const DeviceContext*> template <typename DevCtx, typename PlaceType>
DeviceContextPool::GetAllDeviceContexts() const { inline void EmplaceDeviceContext(
std::vector<const DeviceContext*> all_device_ctx; std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
all_device_ctx.reserve(device_contexts_.size()); map_ptr,
for (auto& dev_ctx : device_contexts_) { platform::Place p) {
all_device_ctx.emplace_back(dev_ctx.second.get()); using PtrType = std::unique_ptr<DeviceContext>;
} map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
return all_device_ctx; // lazy evaluation. i.e., only create device context at
// first `Get`
return PtrType(new DevCtx(boost::get<PlaceType>(p)));
}));
} }
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
using PtrType = std::unique_ptr<DeviceContext>;
std::set<Place> set; std::set<Place> set;
for (auto& p : places) { for (auto& p : places) {
set.insert(p); set.insert(p);
...@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool( ...@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
for (auto& p : set) { for (auto& p : set) {
if (platform::is_cpu_place(p)) { if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace( EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
#else #else
device_contexts_.emplace( EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
#endif #endif
} else if (platform::is_gpu_place(p)) { } else if (platform::is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace( EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool( ...@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
#endif #endif
} else if (platform::is_cuda_pinned_place(p)) { } else if (platform::is_cuda_pinned_place(p)) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace( EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
p, &device_contexts_, p);
PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
#else #else
PADDLE_THROW( PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...@@ -296,38 +294,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } ...@@ -296,38 +294,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() { : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() {
p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>()); p_blobmap_.reset(new BlobMap());
p_mutex_.reset(new std::mutex());
}
namespace {
// Current thread's id.
thread_local int cur_thread_id = 0;
} }
void set_cur_thread_id(int tid) { cur_thread_id = tid; }
int get_cur_thread_id(void) { return cur_thread_id; }
void MKLDNNDeviceContext::SetBlob(const std::string& name, void MKLDNNDeviceContext::SetBlob(const std::string& name,
std::shared_ptr<void> data) const { std::shared_ptr<void> data) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p; BlobMap* pMap = p_blobmap_.get();
p = p_blobs_.get(); std::shared_ptr<KeyBlob> pBlob = nullptr;
int tid = platform::get_cur_thread_id();
std::lock_guard<std::mutex> lock(*p_mutex_.get());
auto it = p->find(name); // Find KeyBlob for current thread
auto map_it = pMap->find(tid);
if (it == p->end()) { if (map_it == pMap->end()) {
(*p)[name] = data; // create new blob // 1st time to set blob in current thread
pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
(*pMap)[tid] = pBlob;
} else { } else {
it->second = data; // set data to existing blob pBlob = map_it->second;
} }
// Find Key in found (or newly created) KeyBlob
auto key_it = pBlob->find(name);
if (key_it == pBlob->end()) {
(*pBlob)[name] = data; // create new blob
} else {
key_it->second = data; // set data to existing blob
}
// lock will be automatically released when out of scope
return; return;
} }
std::shared_ptr<void> MKLDNNDeviceContext::GetBlob( std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const { const std::string& name) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p; BlobMap* pMap = p_blobmap_.get();
p = p_blobs_.get(); std::shared_ptr<KeyBlob> pBlob = nullptr;
auto it = p->find(name); int tid = platform::get_cur_thread_id();
if (it != p->end()) { std::lock_guard<std::mutex> lock(*p_mutex_.get());
return it->second;
} // Find KeyBlob for current thread firstly
auto map_it = pMap->find(tid);
if (map_it == pMap->end()) return nullptr;
pBlob = map_it->second;
// Find Blob via name
auto key_it = pBlob->find(name);
if (key_it == pBlob->end()) return nullptr;
return nullptr; // lock will be automatically released when out of scope
return key_it->second;
} }
#endif #endif
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <future> // NOLINT
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
...@@ -176,6 +177,12 @@ struct DefaultDeviceContextType<platform::CUDAPinnedPlace> { ...@@ -176,6 +177,12 @@ struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
void set_cur_thread_id(int);
int get_cur_thread_id(void);
class MKLDNNDeviceContext : public CPUDeviceContext { class MKLDNNDeviceContext : public CPUDeviceContext {
public: public:
explicit MKLDNNDeviceContext(CPUPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
...@@ -191,8 +198,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -191,8 +198,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
private: private:
mkldnn::engine engine_; mkldnn::engine engine_;
std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>> std::shared_ptr<BlobMap> p_blobmap_;
p_blobs_; std::shared_ptr<std::mutex> p_mutex_;
}; };
#endif #endif
...@@ -217,9 +224,6 @@ class DeviceContextPool { ...@@ -217,9 +224,6 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
platform::DeviceContext* Get(const platform::Place& place); platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return all the device contexts. */
const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
template <typename Place> template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace( const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) { const Place& place) {
...@@ -231,7 +235,8 @@ class DeviceContextPool { ...@@ -231,7 +235,8 @@ class DeviceContextPool {
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_; std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool); DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
}; };
......
...@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed cudnn functions in HPPL * include all needed cudnn functions in HPPL
* different cudnn version has different interfaces * different cudnn version has different interfaces
**/ **/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \ #define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \ __macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \ __macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \ __macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \ __macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \ __macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \ __macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \ __macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \ __macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \ __macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \ __macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \ __macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \ __macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \ __macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \ __macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \ __macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \ __macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \ __macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \ __macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \ __macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \ __macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \ __macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \ __macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \ __macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreate); \ __macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnDestroy); \ __macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnSetStream); \ __macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnActivationForward); \ __macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnConvolutionForward); \ __macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnConvolutionBackwardBias); \ __macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \ __macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnTransformTensor); \ __macro(cudnnCreate); \
__macro(cudnnPoolingForward); \ __macro(cudnnDestroy); \
__macro(cudnnPoolingBackward); \ __macro(cudnnSetStream); \
__macro(cudnnSoftmaxBackward); \ __macro(cudnnActivationForward); \
__macro(cudnnSoftmaxForward); \ __macro(cudnnConvolutionForward); \
__macro(cudnnGetVersion); \ __macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnGetErrorString); __macro(cudnnGetErrorString);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
......
...@@ -57,6 +57,18 @@ struct variant_caster<V<Ts...>> { ...@@ -57,6 +57,18 @@ struct variant_caster<V<Ts...>> {
auto caster = make_caster<T>(); auto caster = make_caster<T>();
if (!load_success_ && caster.load(src, convert)) { if (!load_success_ && caster.load(src, convert)) {
load_success_ = true; load_success_ = true;
if (std::is_same<T, std::vector<float>>::value) {
auto caster_ints = make_caster<std::vector<int64_t>>();
if (caster_ints.load(src, convert)) {
VLOG(4) << "This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>";
value = cast_op<std::vector<int64_t>>(caster_ints);
return true;
}
}
value = cast_op<T>(caster); value = cast_op<T>(caster);
return true; return true;
} }
...@@ -259,6 +271,8 @@ void BindOpDesc(pybind11::module *m) { ...@@ -259,6 +271,8 @@ void BindOpDesc(pybind11::module *m) {
pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "") pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
.value("INT", pd::proto::AttrType::INT) .value("INT", pd::proto::AttrType::INT)
.value("INTS", pd::proto::AttrType::INTS) .value("INTS", pd::proto::AttrType::INTS)
.value("LONG", pd::proto::AttrType::LONG)
.value("LONGS", pd::proto::AttrType::LONGS)
.value("FLOAT", pd::proto::AttrType::FLOAT) .value("FLOAT", pd::proto::AttrType::FLOAT)
.value("FLOATS", pd::proto::AttrType::FLOATS) .value("FLOATS", pd::proto::AttrType::FLOATS)
.value("STRING", pd::proto::AttrType::STRING) .value("STRING", pd::proto::AttrType::STRING)
......
...@@ -645,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -645,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass"); py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init()) pass.def(py::init())
.def("set_str", [](ir::Pass &self, const std::string &name, .def(
const std::string &attr) { "set_str",
self.Set<std::string>(name, new std::string(attr)); [](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
})
.def("set_int", [](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
}); });
py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb( py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
......
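For reference, a minimal Python sketch of driving the newly bound set_int next to the existing set_str; the attribute names used below ("graph_viz_path", "num_repeats") are placeholders for illustration only, not attributes any particular pass is guaranteed to read:

from paddle.fluid import core

# Construct a bare IR pass object through the pybind layer and attach
# both a string attribute and the newly supported integer attribute.
ir_pass = core.Pass()
ir_pass.set_str("graph_viz_path", "./graph_viz")  # hypothetical attribute name
ir_pass.set_int("num_repeats", 3)                 # hypothetical attribute name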
...@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}") ...@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include") include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
include_directories("${PADDLE_LIB}/third_party/install/snappy/include") include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include") include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
...@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") ...@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
add_executable(demo_trainer demo_trainer.cc) add_executable(demo_trainer demo_trainer.cc)
...@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer ...@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer
${ARCHIVE_END} ${ARCHIVE_END}
${MATH_LIB} ${MATH_LIB}
${MKLDNN_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB}) ${EXTERNAL_LIB})
...@@ -95,9 +95,9 @@ function cmake_gen() { ...@@ -95,9 +95,9 @@ function cmake_gen() {
exit 1 exit 1
fi fi
fi fi
else else
if [ "$1" != "" ]; then if [ "$1" != "" ]; then
echo "using python abi: $1" echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
export PATH=/opt/python/cp27-cp27m/bin/:${PATH} export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
...@@ -119,7 +119,7 @@ function cmake_gen() { ...@@ -119,7 +119,7 @@ function cmake_gen() {
fi fi
fi fi
fi fi
if [ "$SYSTEM" == "Darwin" ]; then if [ "$SYSTEM" == "Darwin" ]; then
WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON} WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
WITH_AVX=${WITH_AVX:-ON} WITH_AVX=${WITH_AVX:-ON}
...@@ -127,7 +127,7 @@ function cmake_gen() { ...@@ -127,7 +127,7 @@ function cmake_gen() {
else else
INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
fi fi
cat <<EOF cat <<EOF
======================================== ========================================
Configuring cmake in /paddle/build ... Configuring cmake in /paddle/build ...
...@@ -147,13 +147,11 @@ function cmake_gen() { ...@@ -147,13 +147,11 @@ function cmake_gen() {
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/ -DCUDNN_ROOT=/usr/
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
...@@ -181,12 +179,10 @@ EOF ...@@ -181,12 +179,10 @@ EOF
-DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \ -DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
...@@ -394,8 +390,8 @@ EOF ...@@ -394,8 +390,8 @@ EOF
export http_proxy= export http_proxy=
export https_proxy= export https_proxy=
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j $1 ctest --output-on-failure -j $1
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
...@@ -653,20 +649,20 @@ function gen_capi_package() { ...@@ -653,20 +649,20 @@ function gen_capi_package() {
function gen_fluid_lib() { function gen_fluid_lib() {
mkdir -p ${PADDLE_ROOT}/build mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Generating fluid library for train and inference ... Generating fluid library for train and inference ...
======================================== ========================================
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
make -j `nproc` fluid_lib_dist make -j `nproc` fluid_lib_dist
make -j `nproc` inference_lib_dist make -j `nproc` inference_lib_dist
fi fi
} }
function tar_fluid_lib() { function tar_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Taring fluid library for train and inference ... Taring fluid library for train and inference ...
...@@ -681,7 +677,7 @@ EOF ...@@ -681,7 +677,7 @@ EOF
} }
function test_fluid_lib() { function test_fluid_lib() {
if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Testing fluid library for inference ... Testing fluid library for inference ...
......
...@@ -78,7 +78,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): ...@@ -78,7 +78,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
six.iteritems(word_dict), key=lambda x: x[1], six.iteritems(word_dict), key=lambda x: x[1],
reverse=True)): reverse=True)):
if idx + 3 == dict_size: break if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0])) fout.write("%s\n" % (cpt.to_bytes(word[0])))
def __load_dict(tar_file, dict_size, lang, reverse=False): def __load_dict(tar_file, dict_size, lang, reverse=False):
......
...@@ -121,6 +121,9 @@ def __bootstrap__(): ...@@ -121,6 +121,9 @@ def __bootstrap__():
read_env_flags.append('rpc_server_profile_period') read_env_flags.append('rpc_server_profile_period')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler') read_env_flags.append('enable_rpc_profiler')
read_env_flags.append('rpc_send_thread_num')
read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num')
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
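A minimal sketch of how the newly exposed RPC thread-number flags can be used; since __bootstrap__ reads them from the environment as FLAGS_* variables, they have to be set before paddle.fluid is imported (the thread counts below are illustrative values only):

import os

# Must be set before the first `import paddle.fluid`, because the flags
# are consumed in fluid's __bootstrap__().
os.environ['FLAGS_rpc_send_thread_num'] = '12'
os.environ['FLAGS_rpc_get_thread_num'] = '12'
os.environ['FLAGS_rpc_prefetch_thread_num'] = '12'

import paddle.fluid as fluid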
...@@ -272,7 +272,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -272,7 +272,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
) )
square = grad * grad square = grad * grad
local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64') local_norm_var = layers.reduce_sum(input=square)
context[self.group_name].append(local_norm_var) context[self.group_name].append(local_norm_var)
self.context = context self.context = context
...@@ -282,7 +282,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -282,7 +282,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
if group_scale_name not in self.context: if group_scale_name not in self.context:
group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sums(input=self.context[self.group_name])
group_norm_var = layers.sqrt(x=group_norm_var) group_norm_var = layers.sqrt(x=group_norm_var)
group_norm_var = layers.cast(group_norm_var, 'float32')
clip_var = self.context[self.group_name + "_clip"] clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div( group_scale_var = layers.elementwise_div(
x=clip_var, x=clip_var,
......
...@@ -316,7 +316,7 @@ class DetectionMAP(Evaluator): ...@@ -316,7 +316,7 @@ class DetectionMAP(Evaluator):
gt_label (Variable): The ground truth label index, which is a LoDTensor gt_label (Variable): The ground truth label index, which is a LoDTensor
with shape [N, 1]. with shape [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax]. LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
gt_difficult (Variable|None): Whether this ground truth is a difficult gt_difficult (Variable|None): Whether this ground truth is a difficult
bounding bbox, which can be a LoDTensor [N, 1] or not set. If None, bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
it means all the ground truth labels are not difficult bbox. it means all the ground truth labels are not difficult bbox.
......
...@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): ...@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
load_prog = Program() load_prog = Program()
load_block = load_prog.global_block() load_block = load_prog.global_block()
need_delete_vars = []
for var_tuple in slice_vars_and_attrs: for var_tuple in slice_vars_and_attrs:
orig_var = var_tuple[0] orig_var = var_tuple[0]
start = var_tuple[1] start = var_tuple[1]
slice_var = var_tuple[2] slice_var = var_tuple[2]
end = start + reduce(lambda x, y: x * y, slice_var.shape) end = start + slice_var.shape[0]
clone_orig_var = load_block.create_var( clone_orig_var = load_block.create_var(
name=orig_var.name, name=orig_var.name,
...@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): ...@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
attrs={'axes': [0], attrs={'axes': [0],
'starts': [start], 'starts': [start],
'ends': [end]}) 'ends': [end]})
need_delete_vars.append(clone_orig_var)
load_block.append_op(
type='delete_var',
inputs={'X': need_delete_vars}, )
executor.run(load_prog) executor.run(load_prog)
...@@ -1424,7 +1424,36 @@ def generate_proposal_labels(rpn_rois, ...@@ -1424,7 +1424,36 @@ def generate_proposal_labels(rpn_rois,
use_random=True): use_random=True):
""" """
** Generate proposal labels Faster-RCNN ** ** Generate proposal labels Faster-RCNN **
TODO(buxingyuan): Add Document Given the bounding boxes produced by GenerateProposalOp and the groundtruth,
this operator samples foreground and background boxes and computes the loss targets.
RpnRois are the output boxes of RPN, already processed by generate_proposal_op; these boxes
are combined with the groundtruth boxes and sampled according to batch_size_per_im and fg_fraction.
An instance whose overlap with the groundtruth is greater than fg_thresh is considered a foreground sample.
An instance whose overlap with the groundtruth is greater than bg_thresh_lo and lower than bg_thresh_hi
is considered a background sample.
After all foreground and background boxes are chosen (the so-called Rois),
random sampling is applied to make sure
the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
Each box in Rois is assigned a classification target (class label) and a regression target (box label).
Finally, BboxInsideWeights and BboxOutsideWeights are used to specify whether a box contributes to the training loss.
Args:
rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth.
is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicating whether a groundtruth instance is crowd.
gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale.
batch_size_per_im(int): Batch size of rois per image.
fg_fraction(float): Foreground fraction in total batch_size_per_im.
fg_thresh(float): Overlap threshold which is used to choose foreground samples.
bg_thresh_hi(float): Overlap threshold upper bound which is used to choose background samples.
bg_thresh_lo(float): Overlap threshold lower bound which is used to choose background samples.
bbox_reg_weights(list|tuple): Box regression weights.
class_nums(int): Class number.
use_random(bool): Use random sampling to choose foreground and background boxes.
""" """
helper = LayerHelper('generate_proposal_labels', **locals()) helper = LayerHelper('generate_proposal_labels', **locals())
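A hedged usage sketch for the documented arguments; the layer names, shapes, thresholds and class count below are illustrative, and the five return values assumed here follow the op's usual outputs (rois, class labels, box targets and the two weight tensors):

import paddle.fluid as fluid

rpn_rois = fluid.layers.data(name='rpn_rois', shape=[4], dtype='float32', lod_level=1)
gt_classes = fluid.layers.data(name='gt_classes', shape=[1], dtype='int32', lod_level=1)
is_crowd = fluid.layers.data(name='is_crowd', shape=[1], dtype='int32', lod_level=1)
gt_boxes = fluid.layers.data(name='gt_boxes', shape=[4], dtype='float32', lod_level=1)
im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')

rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = \
    fluid.layers.generate_proposal_labels(
        rpn_rois=rpn_rois, gt_classes=gt_classes, is_crowd=is_crowd,
        gt_boxes=gt_boxes, im_info=im_info,
        batch_size_per_im=256, fg_fraction=0.25, fg_thresh=0.5,
        bg_thresh_hi=0.5, bg_thresh_lo=0.0,
        bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], class_nums=81)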
...@@ -1487,7 +1516,7 @@ def generate_proposals(scores, ...@@ -1487,7 +1516,7 @@ def generate_proposals(scores,
eta=1.0, eta=1.0,
name=None): name=None):
""" """
** Generate proposal labels Faster-RCNN ** ** Generate proposal Faster-RCNN **
This operation proposes RoIs according to each box with their probability to be a foreground object and This operation proposes RoIs according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
......
...@@ -27,7 +27,7 @@ from . import nn ...@@ -27,7 +27,7 @@ from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter, unique_name from ..framework import default_main_program, Parameter, unique_name, name_scope
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
...@@ -332,14 +332,16 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -332,14 +332,16 @@ def append_LARS(params_grads, learning_rate, weight_decay):
return grad_norm + weight_decay * param_norm return grad_norm + weight_decay * param_norm
for param, grad in params_grads: for param, grad in params_grads:
param_lr = param.optimize_attr['learning_rate'] with param.block.program.optimized_guard(
param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param))) [param, grad]), name_scope("optimizer"):
grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) param_lr = param.optimize_attr['learning_rate']
if type(param_lr) == float and param_lr == 1.0: param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
decayed_lr = learning_rate * param_norm \ grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
/ _balanced_weight(param_norm, grad_norm) if type(param_lr) == float and param_lr == 1.0:
else: decayed_lr = learning_rate * param_norm \
decayed_lr = learning_rate * param_lr * param_norm \ / _balanced_weight(param_norm, grad_norm)
/ _balanced_weight(param_norm, grad_norm) else:
# set back param local learning rate decayed_lr = learning_rate * param_lr * param_norm \
param.optimize_attr['learning_rate'] = decayed_lr / _balanced_weight(param_norm, grad_norm)
# set back param local learning rate
param.optimize_attr['learning_rate'] = decayed_lr
...@@ -154,7 +154,12 @@ __all__ = [ ...@@ -154,7 +154,12 @@ __all__ = [
'mul', 'mul',
'sigmoid_cross_entropy_with_logits', 'sigmoid_cross_entropy_with_logits',
'maxout', 'maxout',
'affine_grid',
'sequence_reverse',
'affine_channel', 'affine_channel',
'hash',
'log_loss',
'add_position_encoding',
] ]
...@@ -706,8 +711,18 @@ def dynamic_gru(input, ...@@ -706,8 +711,18 @@ def dynamic_gru(input,
The first part are weights of the update gate and reset gate with The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`. candidate hidden state with shape :math:`(D \\times D)`.
bias_attr(ParamAttr): The parameter attribute for learnable the
hidden-hidden bias. If it is set to None or one attribute of ParamAttr, dynamic_gru will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, dynamic_gru will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
is_reverse(bool): Whether to compute reversed GRU, default is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`. :attr:`False`.
gate_activation(str): The activation for update gate and reset gate. gate_activation(str): The activation for update gate and reset gate.
...@@ -745,7 +760,7 @@ def dynamic_gru(input, ...@@ -745,7 +760,7 @@ def dynamic_gru(input,
attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
batch_size = input.shape[0] batch_size = input.shape[0]
inputs = {'Input': input, 'Weight': weight, 'Bias': bias} inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
if h_0 != None: if h_0:
assert h_0.shape == ( assert h_0.shape == (
batch_size, size batch_size, size
), 'The shape of h0 should be(batch_size, %d)' % size ), 'The shape of h0 should be(batch_size, %d)' % size
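A short sketch of the documented bias_attr behaviour for dynamic_gru, assuming the usual pattern where the input is first projected to 3 * hidden_dim by an fc layer (vocabulary and hidden sizes are illustrative):

import paddle.fluid as fluid

hidden_dim = 512
words = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=words, size=[10000, 256])
proj = fluid.layers.fc(input=emb, size=hidden_dim * 3)

# bias_attr=False drops the (1 x 3D) bias shared by the update, reset and
# candidate computations; bias_attr=None would create a zero-initialized one.
gru_out = fluid.layers.dynamic_gru(input=proj, size=hidden_dim, bias_attr=False)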
...@@ -806,10 +821,29 @@ def gru_unit(input, ...@@ -806,10 +821,29 @@ def gru_unit(input,
Args: Args:
input (Variable): The fc transformed input value of current step. input (Variable): The fc transformed input value of current step.
hidden (Variable): The hidden value of lstm unit from previous step. hidden (Variable): The hidden value of gru unit from previous step.
size (integer): The input dimension value. size (integer): The input dimension value.
param_attr (ParamAttr): The weight parameters for gru unit. Default: None param_attr(ParamAttr|None): The parameter attribute for the learnable
bias_attr (ParamAttr): The bias parameters for gru unit. Default: None hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(T \\times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
activation (string): The activation type for cell (actNode). activation (string): The activation type for cell (actNode).
Default: 'tanh' Default: 'tanh'
gate_activation (string): The activation type for gates (actGate). gate_activation (string): The activation type for gates (actGate).
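A hedged sketch of the corresponding bias_attr usage for gru_unit; the sizes below are illustrative, and the three return values assumed here (updated hidden, reset hidden, gate) follow the layer's usual outputs:

import paddle.fluid as fluid

hidden_dim = 256
x_t = fluid.layers.data(name='x_t', shape=[hidden_dim * 3], dtype='float32')
prev_hidden = fluid.layers.data(name='prev_hidden', shape=[hidden_dim], dtype='float32')

# gru_unit consumes the fc-transformed input of the current step plus the
# previous hidden state; bias_attr=False removes the (1 x 3D) bias.
hidden, reset_hidden_pre, gate = fluid.layers.gru_unit(
    input=x_t, hidden=prev_hidden, size=hidden_dim * 3, bias_attr=False)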
...@@ -980,7 +1014,12 @@ def cos_sim(X, Y): ...@@ -980,7 +1014,12 @@ def cos_sim(X, Y):
return out return out
def dropout(x, dropout_prob, is_test=False, seed=None, name=None): def dropout(x,
dropout_prob,
is_test=False,
seed=None,
name=None,
dropout_implementation="downgrade_in_infer"):
""" """
Computes dropout. Computes dropout.
...@@ -1000,6 +1039,21 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): ...@@ -1000,6 +1039,21 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
units will be dropped. DO NOT use a fixed seed in training. units will be dropped. DO NOT use a fixed seed in training.
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
dropout_implementation(string): ['downgrade_in_infer' (default) | 'upscale_in_train']
1. downgrade_in_infer (default), downgrade the outcome at inference time
train: out = input * mask
inference: out = input * dropout_prob
(mask is a tensor with the same shape as input; its values are 0 or 1,
and the ratio of 0 is dropout_prob)
2. upscale_in_train, upscale the outcome at training time
train: out = input * mask / ( 1.0 - dropout_prob )
inference: out = input
(mask is a tensor with the same shape as input; its values are 0 or 1,
and the ratio of 0 is dropout_prob)
With this implementation the dropout op can be removed from the program
at inference time, which makes the program more efficient
(a small NumPy illustration of both modes follows below).
Returns: Returns:
Variable: A tensor variable is the shape with `x`. Variable: A tensor variable is the shape with `x`.
...@@ -1029,7 +1083,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): ...@@ -1029,7 +1083,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
'dropout_prob': dropout_prob, 'dropout_prob': dropout_prob,
'is_test': is_test, 'is_test': is_test,
'fix_seed': seed is not None, 'fix_seed': seed is not None,
'seed': seed if seed is not None else 0 'seed': seed if seed is not None else 0,
'dropout_implementation': dropout_implementation,
}) })
return out return out
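A small NumPy sketch of the two dropout implementations described in the docstring, using an illustrative dropout_prob of 0.5; this only mirrors the formulas above, it is not the operator's actual kernel:

import numpy as np

np.random.seed(0)
x = np.random.rand(4, 3).astype('float32')
dropout_prob = 0.5
mask = (np.random.rand(*x.shape) >= dropout_prob).astype('float32')

# downgrade_in_infer (default)
train_out = x * mask
infer_out = x * dropout_prob            # inference scales the input down

# upscale_in_train
train_out_up = x * mask / (1.0 - dropout_prob)
infer_out_up = x                        # inference is a no-op, so the op can be removed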
...@@ -1800,7 +1855,7 @@ def conv3d(input, ...@@ -1800,7 +1855,7 @@ def conv3d(input,
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
def sequence_pool(input, pool_type): def sequence_pool(input, pool_type, is_test=False):
""" """
This function add the operator for sequence pooling. This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied It pools features of all time-steps of each instance, and is applied
...@@ -1837,6 +1892,7 @@ def sequence_pool(input, pool_type): ...@@ -1837,6 +1892,7 @@ def sequence_pool(input, pool_type):
input(variable): The input variable which is a LoDTensor. input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool. pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max. It supports average, sum, sqrt and max.
is_test(bool, Default False): Used to distinguish training from scoring (inference) mode.
Returns: Returns:
The sequence pooling variable which is a Tensor. The sequence pooling variable which is a Tensor.
...@@ -1864,7 +1920,8 @@ def sequence_pool(input, pool_type): ...@@ -1864,7 +1920,8 @@ def sequence_pool(input, pool_type):
inputs={"X": input}, inputs={"X": input},
outputs={"Out": pool_out, outputs={"Out": pool_out,
"MaxIndex": max_index}, "MaxIndex": max_index},
attrs={"pooltype": pool_type.upper()}) attrs={"pooltype": pool_type.upper(),
"is_test": is_test})
# when pool_type is max, variable max_index is initialized, # when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here # so we stop the gradient explicitly here
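A minimal usage sketch of the new is_test switch; names and shapes are illustrative:

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[7], dtype='float32', lod_level=1)
avg_pool = fluid.layers.sequence_pool(input=x, pool_type='average')
# is_test=True tells max pooling it can skip work needed only for training
# (e.g. the max-index bookkeeping used by the backward pass).
max_pool = fluid.layers.sequence_pool(input=x, pool_type='max', is_test=True)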
...@@ -1969,17 +2026,17 @@ def sequence_slice(input, offset, length, name=None): ...@@ -1969,17 +2026,17 @@ def sequence_slice(input, offset, length, name=None):
""" """
**Sequence Slice Layer** **Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start The layer crops a subsequence from given sequence with given start
offset and subsequence length. offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1). It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text .. code-block:: text
- Case: - Case:
Given the input Variable **input**: Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]], input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]], input.lod = [[3, 2]],
input.dims = (5, 2), input.dims = (5, 2),
...@@ -1987,16 +2044,16 @@ def sequence_slice(input, offset, length, name=None): ...@@ -1987,16 +2044,16 @@ def sequence_slice(input, offset, length, name=None):
with offset.data = [[0], [1]] and length.data = [[2], [1]], with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]], out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]], out.lod = [[2, 1]],
out.dims = (3, 2). out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length** NOTE: The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0. should be equal. The **offset** should start from 0.
Args: Args:
input(Variable): The input Variable which consists of the complete input(Variable): The input Variable which consists of the complete
sequences. sequences.
offset(Variable): The offset to slice each sequence. offset(Variable): The offset to slice each sequence.
length(Variable): The length of each subsequence. length(Variable): The length of each subsequence.
...@@ -2015,7 +2072,7 @@ def sequence_slice(input, offset, length, name=None): ...@@ -2015,7 +2072,7 @@ def sequence_slice(input, offset, length, name=None):
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length) length=length)
""" """
helper = LayerHelper("sequence_slice", **locals()) helper = LayerHelper("sequence_slice", **locals())
...@@ -2398,12 +2455,12 @@ def layer_norm(input, ...@@ -2398,12 +2455,12 @@ def layer_norm(input,
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None, omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None. :attr:`param_attr` is initialized as 1 if it is added. Default None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`param_attr` is None, omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as bias. The a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default None. :attr:`bias_attr` is initialized as 0 if it is added. Default None.
act(str): Activation to be applied to the output of layer normalizaiton. act(str): Activation to be applied to the output of layer normalizaiton.
Default None. Default None.
...@@ -2993,7 +3050,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): ...@@ -2993,7 +3050,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
x = fluid.layers.data(name='y', shape=[10, 5], x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0])) pad_value = fluid.layers.assign(
input=numpy.array([0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
""" """
...@@ -3021,8 +3079,8 @@ def sequence_unpad(x, length, name=None): ...@@ -3021,8 +3079,8 @@ def sequence_unpad(x, length, name=None):
""" """
**Sequence Unpad Layer** **Sequence Unpad Layer**
This layer removes the padding data in the input sequences and converts This layer removes the padding data in the input sequences and converts
them into sequences with actual length as output, identified by lod them into sequences with actual length as output, identified by lod
information. information.
.. code-block:: text .. code-block:: text
...@@ -3032,9 +3090,9 @@ def sequence_unpad(x, length, name=None): ...@@ -3032,9 +3090,9 @@ def sequence_unpad(x, length, name=None):
Given input Variable **x**: Given input Variable **x**:
x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0], [ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]], [11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the actual length in which there are 3 sequences padded to length 5, and the actual length
specified by input Variable **length**: specified by input Variable **length**:
length.data = [[2], [3], [4]], length.data = [[2], [3], [4]],
...@@ -3042,7 +3100,7 @@ def sequence_unpad(x, length, name=None): ...@@ -3042,7 +3100,7 @@ def sequence_unpad(x, length, name=None):
after unpadding, the output Variable will be: after unpadding, the output Variable will be:
out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
out.lod = [[2, 3, 4]] out.lod = [[2, 3, 4]]
Args: Args:
x(Variable): Input Variable which contains the padded sequences with x(Variable): Input Variable which contains the padded sequences with
...@@ -4415,7 +4473,10 @@ def transpose(x, perm, name=None): ...@@ -4415,7 +4473,10 @@ def transpose(x, perm, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32') # use append_batch_size=False to avoid prepending extra
# batch size in shape
x = fluid.layers.data(name='x', shape=[5, 10, 15],
dtype='float32', append_batch_size=False)
x_transposed = layers.transpose(x, perm=[1, 0, 2]) x_transposed = layers.transpose(x, perm=[1, 0, 2])
""" """
...@@ -4867,7 +4928,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): ...@@ -4867,7 +4928,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
return counter return counter
def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
""" """
Gives a new shape to the input Tensor without changing its data. Gives a new shape to the input Tensor without changing its data.
...@@ -4915,15 +4976,22 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4915,15 +4976,22 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
:attr:`shape` specifying shape. That is to :attr:`shape` specifying shape. That is to
say :attr:`actual_shape` has a higher priority say :attr:`actual_shape` has a higher priority
than :attr:`shape`. than :attr:`shape`.
act (str): The non-linear activation to be applied to output variable. act (str): The non-linear activation to be applied to the reshaped tensor
inplace(bool): If this flag is set true, the output variable.
shares data with input without copying, otherwise inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
a new output tensor is created operators. If this flag is set :attr:`True`, reuse input
whose data is copied from input x. :attr:`x` to reshape, which will change the shape of
tensor variable :attr:`x` and might cause errors when
:attr:`x` is used in multiple operators. If :attr:`False`,
preserve the shape :attr:`x` and create a new output tensor
variable whose data is copied from input x but reshaped.
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
Variable: The output tensor. Variable: The reshaped tensor variable if :attr:`act` is None. It is a \
new tensor variable if :attr:`inplace` is :attr:`False`, \
otherwise it is :attr:`x`. If :attr:`act` is not None, return \
the activated tensor variable.
Raises: Raises:
TypeError: if actual_shape is neither Variable nor None. TypeError: if actual_shape is neither Variable nor None.
...@@ -4934,7 +5002,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4934,7 +5002,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
data = fluid.layers.data( data = fluid.layers.data(
name='data', shape=[2, 4, 6], dtype='float32') name='data', shape=[2, 4, 6], dtype='float32')
reshaped = fluid.layers.reshape( reshaped = fluid.layers.reshape(
x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) x=data, shape=[-1, 0, 3, 2], inplace=True)
""" """
if not (isinstance(shape, list) or isinstance(shape, tuple)): if not (isinstance(shape, list) or isinstance(shape, tuple)):
...@@ -4961,7 +5029,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4961,7 +5029,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape2", **locals()) helper = LayerHelper("reshape2", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = x if inplace else helper.create_variable_for_type_inference(
dtype=x.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape2", type="reshape2",
...@@ -5492,9 +5561,9 @@ def roi_align(input, ...@@ -5492,9 +5561,9 @@ def roi_align(input,
Examples: Examples:
.. code-block:: python .. code-block:: python
align_out = fluid.layers.roi_align(input=x, align_out = fluid.layers.roi_align(input=x,
rois=rois, rois=rois,
pooled_height=7, pooled_height=7,
pooled_width=7, pooled_width=7,
spatial_scale=0.5, spatial_scale=0.5,
sampling_ratio=-1) sampling_ratio=-1)
...@@ -6095,6 +6164,124 @@ def crop(x, shape=None, offsets=None, name=None): ...@@ -6095,6 +6164,124 @@ def crop(x, shape=None, offsets=None, name=None):
return out return out
def affine_grid(theta, out_shape, name=None):
"""
It generates a grid of (x,y) coordinates using the parameters of
the affine transformation that correspond to a set of points where
the input feature map should be sampled to produce the transformed
output feature map.
.. code-block:: text
* Case 1:
Given:
theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
out_shape = [2, 3, 5, 5]
Step 1:
Generate normalized coordinates according to out_shape.
The values of the normalized coordinates are in the interval between -1 and 1.
The shape of the normalized coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] contains the coordinates along the height axis and C[1] contains the coordinates along the width axis.
Step2:
Transpose and reshape C to shape [H * W, 2] and append ones to the last dimension. Then we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
out_shape can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output with shape [N, H, W, 2].
Raises:
ValueError: If the type of arguments is not supported.
Examples:
.. code-block:: python
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape)
# or
data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
"""
helper = LayerHelper('affine_grid')
if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
isinstance(out_shape, Variable)):
raise ValueError("The out_shape should be a list, tuple or Variable.")
if not isinstance(theta, Variable):
raise ValueError("The theta should be a Variable.")
out = helper.create_variable_for_type_inference(theta.dtype)
ipts = {'Theta': theta}
attrs = {}
if isinstance(out_shape, Variable):
ipts['OutputShape'] = out_shape
else:
attrs['output_shape'] = out_shape
helper.append_op(
type='affine_grid',
inputs=ipts,
outputs={'Output': out},
attrs=None if len(attrs) == 0 else attrs)
return out
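A rough NumPy sketch of Steps 1-3 above, assuming out_shape = [N, C, H, W]; it mirrors the reference implementation used by the unit test later in this diff and is not the operator itself.

import numpy as np

def affine_grid_np(theta, out_shape):
    n, _, h, w = out_shape
    # Step 1: normalized coordinates in [-1, 1] along height and width.
    hh = np.linspace(-1, 1, h)
    ww = np.linspace(-1, 1, w)
    # Step 2: homogeneous coordinates (x, y, 1) of shape [H * W, 3].
    grid_w, grid_h = np.meshgrid(ww, hh)
    c = np.stack([grid_w, grid_h, np.ones_like(grid_w)], axis=-1).reshape(h * w, 3)
    # Step 3: Output[i] = C_ * Theta[i]^T, reshaped to [H, W, 2].
    out = np.stack([c.dot(theta[i].T) for i in range(n)])
    return out.reshape(n, h, w, 2)

theta = np.tile(np.array([[1., 0., 0.], [0., 1., 0.]]), (2, 1, 1))  # identity transforms
print(affine_grid_np(theta, [2, 3, 5, 5]).shape)   # (2, 5, 5, 2)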
def rank_loss(label, left, right, name=None): def rank_loss(label, left, right, name=None):
""" """
**Rank loss layer for RankNet** **Rank loss layer for RankNet**
...@@ -7478,13 +7665,40 @@ def maxout(x, groups, name=None): ...@@ -7478,13 +7665,40 @@ def maxout(x, groups, name=None):
return out return out
@templatedoc()
def sequence_reverse(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${y_type}): ${y_comment}
"""
helper = LayerHelper("sequence_reverse", **locals())
if name is None:
out = helper.create_variable_for_type_inference(dtype=x.dtype)
else:
out = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
helper.append_op(
type="sequence_reverse",
inputs={"X": x},
outputs={"Y": out},
attrs=dict())
return out
def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
""" """
Applies a separate affine transformation to each channel of the input. Applies a separate affine transformation to each channel of the input.
Useful for replacing spatial batch norm with its equivalent fixed Useful for replacing spatial batch norm with its equivalent fixed
transformation. The input also can be 2D tensor and applies a affine transformation. The input also can be 2D tensor and applies a affine
transformation in second dimension. transformation in second dimension.
Args: Args:
x (Variable): Feature map input can be a 4D tensor with order NCHW x (Variable): Feature map input can be a 4D tensor with order NCHW
or NHWC. It also can be a 2D tensor and the affine transformation or NHWC. It also can be a 2D tensor and the affine transformation
...@@ -7517,3 +7731,127 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None): ...@@ -7517,3 +7731,127 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
attrs={"data_layout": data_layout}, attrs={"data_layout": data_layout},
outputs={"Out": out}) outputs={"Out": out})
return out return out
def hash(input, hash_size, num_hash=1, name=None):
"""
Hash the input into an integer in the range [0, hash_size).
Args:
input (Variable): The input variable which is a one-hot word.
hash_size (int): The space size for hash algorithm.
num_hash (int): The number of times to hash the input, default 1.
name (str, default None): The name of this layer.
Returns:
Variable: The hash result variable which is a LoDTensor.
Examples:
.. code-block:: python
word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, hash_size=len(word_dict))
"""
helper = LayerHelper('hash', **locals())
out = helper.create_variable_for_type_inference(
helper.input_dtype(), stop_gradient=True)
helper.append_op(
type='hash',
inputs={'X': input},
outputs={'Out': out},
attrs={'num_hash': num_hash,
'mod_by': hash_size})
return out
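Conceptually, the layer maps each input id to num_hash integers in [0, hash_size). The toy sketch below uses Python's built-in hash() purely as a stand-in; it is not the operator's actual hash function.

def toy_hash(ids, hash_size, num_hash=1):
    # one hashed value per (seed, id) pair, reduced modulo hash_size
    return [[hash((seed, i)) % hash_size for seed in range(num_hash)]
            for i in ids]

print(toy_hash([7, 42, 7], hash_size=1000, num_hash=2))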
def log_loss(input, label, epsilon=1e-4, name=None):
"""
**Negative Log Loss Layer**
This layer accepts input predictions and target label and returns the
negative log loss.
.. math::
Out = -label * \\log{(input + \\epsilon)}
- (1 - label) * \\log{(1 - input + \\epsilon)}
Args:
input (Variable|list): a 2-D tensor with shape [N x 1], where N is the
batch size. This input is a probability computed
by the previous operator.
label (Variable|list): the ground truth which is a 2-D tensor with
shape [N x 1], where N is the batch size.
epsilon (float): a small value added for numerical stability.
name (string): the name of log_loss
Returns:
Variable: A 2-D tensor with shape [N x 1], the negative log loss.
Examples:
.. code-block:: python
prob = fluid.layers.sigmoid(net)
cost = fluid.layers.log_loss(input=prob, label=label)
"""
helper = LayerHelper('log_loss', **locals())
if name is None:
loss = helper.create_variable_for_type_inference(dtype=input.dtype)
else:
loss = helper.create_variable(
name=name, dtype=input.dtype, persistable=False)
helper.append_op(
type='log_loss',
inputs={'Predicted': [input],
'Labels': [label]},
outputs={'Loss': [loss]},
attrs={'epsilon': epsilon})
return loss
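A quick NumPy check of the formula above, assuming shapes [N x 1].

import numpy as np

def log_loss_np(pred, label, epsilon=1e-4):
    # Out = -label * log(pred + eps) - (1 - label) * log(1 - pred + eps)
    return -label * np.log(pred + epsilon) \
           - (1.0 - label) * np.log(1.0 - pred + epsilon)

pred = np.array([[0.9], [0.2]], dtype=np.float32)
label = np.array([[1.0], [0.0]], dtype=np.float32)
print(log_loss_np(pred, label))   # ~[[0.105], [0.223]]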
def add_position_encoding(input, alpha, beta, name=None):
"""
**Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an
output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math::
PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\
PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\
Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
Where:
* PE(pos, 2i): the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position
Args:
input (Variable): 3-D input tensor with shape [N x M x P]
alpha (float): scale applied to the input tensor
beta (float): scale applied to the positional encoding tensor
name (string): the name of position encoding layer
Returns:
Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
Examples:
.. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor)
"""
helper = LayerHelper('add_position_encoding', **locals())
dtype = helper.input_dtype()
if name is None:
out = helper.create_variable_for_type_inference(dtype=dtype)
else:
out = helper.create_variable(name=name, dtype=dtype, persistable=False)
helper.append_op(
type="add_position_encoding",
inputs={"X": input},
outputs={"Out": out},
attrs={"alpha": alpha,
"beta": beta})
return out
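Note that, per the unit test added later in this diff, the operator places the sine terms in the first half of the feature dimension and the cosine terms in the second half rather than interleaving them. A vectorized NumPy sketch of that layout, assuming an even feature size P:

import numpy as np

def position_encoding_np(x, alpha, beta):
    n, m, p = x.shape
    half = p // 2
    pos = np.arange(m)[:, None]                               # positions, shape [M, 1]
    if half > 1:
        div = np.power(10000.0, np.arange(half) / (half - 1.0))
    else:
        div = np.array([10000.0])
    # sin terms fill columns [0, half), cos terms fill columns [half, P)
    pe = np.concatenate([np.sin(pos / div), np.cos(pos / div)], axis=1)
    return alpha * x + beta * pe[None, :, :]

x = np.random.uniform(0.1, 1, [2, 4, 4]).astype("float32")
print(position_encoding_np(x, alpha=0.6, beta=0.5).shape)   # (2, 4, 4)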
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
# limitations under the License. # limitations under the License.
""" """
Fluid Metrics Fluid Metrics
The metrics are accomplished via Python natively.
""" """
from __future__ import print_function from __future__ import print_function
...@@ -24,6 +22,12 @@ import copy ...@@ -24,6 +22,12 @@ import copy
import warnings import warnings
import six import six
from .layer_helper import LayerHelper
from .initializer import Constant
from . import unique_name
from .framework import Program, Variable, program_guard
from . import layers
__all__ = [ __all__ = [
'MetricBase', 'MetricBase',
'CompositeMetric', 'CompositeMetric',
...@@ -190,7 +194,7 @@ class CompositeMetric(MetricBase): ...@@ -190,7 +194,7 @@ class CompositeMetric(MetricBase):
or soft-label, should custom the corresponding update rule. or soft-label, should custom the corresponding update rule.
""" """
for m in self._metrics: for m in self._metrics:
ans.append(m.update(preds, labels)) m.update(preds, labels)
def eval(self): def eval(self):
""" """
...@@ -474,71 +478,10 @@ class EditDistance(MetricBase): ...@@ -474,71 +478,10 @@ class EditDistance(MetricBase):
"There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
) )
avg_distance = self.total_distance / self.seq_num avg_distance = self.total_distance / self.seq_num
avg_instance_error = self.instance_error / self.seq_num avg_instance_error = self.instance_error / float(self.seq_num)
return avg_distance, avg_instance_error return avg_distance, avg_instance_error
class DetectionMAP(MetricBase):
"""
Calculate the detection mean average precision (mAP).
mAP is the metric to measure the accuracy of object detectors
like Faster R-CNN, SSD, etc.
It is the average of the maximum precisions at different recall values.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Examples:
.. code-block:: python
pred = fluid.layers.fc(input=data, size=1000, act="tanh")
batch_map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
ap_version=ap_version)
metric = fluid.metrics.DetectionMAP()
for data in train_reader():
loss, preds, labels = exe.run(fetch_list=[cost, batch_map])
batch_size = data[0]
metric.update(value=batch_map, weight=batch_size)
numpy_map = metric.eval()
"""
def __init__(self, name=None):
super(DetectionMAP, self).__init__(name)
# the current map value
self.value = .0
self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in DetectionMAP Metrics. "
"Please check layers.detection_map output has added to DetectionMAP."
)
return self.value / self.weight
class Auc(MetricBase): class Auc(MetricBase):
""" """
Auc metric adapts to the binary classification. Auc metric adapts to the binary classification.
...@@ -616,3 +559,179 @@ class Auc(MetricBase): ...@@ -616,3 +559,179 @@ class Auc(MetricBase):
idx -= 1 idx -= 1
return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
class DetectionMAP(object):
"""
Calculate the detection mean average precision (mAP).
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
Args:
input (Variable): The detection results, which is a LoDTensor with shape
[M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
gt_label (Variable): The ground truth label index, which is a LoDTensor
with shape [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
gt_difficult (Variable|None): Whether this ground truth is a difficult
bounding box, which can be a LoDTensor [N, 1] or not set. If None,
it means all the ground truth labels are not difficult bbox.
class_num (int): The class number.
background_label (int): The index of background label, the background
label will be ignored. If set to -1, then all categories will be
considered, 0 by default.
overlap_threshold (float): The threshold for deciding true/false
positive, 0.5 by default.
evaluate_difficult (bool): Whether to consider difficult ground truth
for evaluation, True by default. This argument does not work when
gt_difficult is None.
ap_version (string): The average precision calculation ways, it must be
'integral' or '11point'. Please check
https://sanchom.wordpress.com/tag/average-precision/ for details.
- 11point: the 11-point interpolated average precision.
- integral: the natural integral of the precision-recall curve.
Examples:
.. code-block:: python
exe = fluid.Executor(place)
map_evaluator = fluid.metrics.DetectionMAP(input,
gt_label, gt_box, gt_difficult)
cur_map, accum_map = map_evaluator.get_map_var()
fetch = [cost, cur_map, accum_map]
for epoch in PASS_NUM:
map_evaluator.reset(exe)
for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
In the above example:
'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass.
"""
def __init__(self,
input,
gt_label,
gt_box,
gt_difficult=None,
class_num=None,
background_label=0,
overlap_threshold=0.5,
evaluate_difficult=True,
ap_version='integral'):
self.helper = LayerHelper('map_eval')
gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
if gt_difficult:
gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
else:
label = layers.concat([gt_label, gt_box], axis=1)
# calculate mean average precision (mAP) of current mini-batch
map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
ap_version=ap_version)
states = []
states.append(
self._create_state(
dtype='int32', shape=None, suffix='accum_pos_count'))
states.append(
self._create_state(
dtype='float32', shape=None, suffix='accum_true_pos'))
states.append(
self._create_state(
dtype='float32', shape=None, suffix='accum_false_pos'))
var = self._create_state(dtype='int32', shape=[1], suffix='has_state')
self.helper.set_variable_initializer(
var, initializer=Constant(value=int(0)))
self.has_state = var
# calculate accumulative mAP
accum_map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
has_state=self.has_state,
input_states=states,
out_states=states,
ap_version=ap_version)
layers.fill_constant(
shape=self.has_state.shape,
value=1,
dtype=self.has_state.dtype,
out=self.has_state)
self.cur_map = map
self.accum_map = accum_map
def _create_state(self, suffix, dtype, shape):
"""
Create state variable.
Args:
suffix(str): the state suffix.
dtype(str|core.VarDesc.VarType): the state data type
shape(tuple|list): the shape of state
Returns: State variable
"""
state = self.helper.create_variable(
name="_".join([unique_name.generate(self.helper.name), suffix]),
persistable=True,
dtype=dtype,
shape=shape)
return state
def get_map_var(self):
"""
Returns: mAP variable of current mini-batch and
accumulative mAP variable across mini-batches.
"""
return self.cur_map, self.accum_map
def reset(self, executor, reset_program=None):
"""
Reset metric states at the beginning of each pass or user-specified batch.
Args:
executor(Executor): an executor for executing
the reset_program.
reset_program(Program|None): a single Program for reset process.
If None, will create a Program.
"""
def _clone_var_(block, var):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=var.persistable)
if reset_program is None:
reset_program = Program()
with program_guard(main_program=reset_program):
var = _clone_var_(reset_program.current_block(), self.has_state)
layers.fill_constant(
shape=var.shape, value=0, dtype=var.dtype, out=var)
executor.run(reset_program)
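To illustrate the two ap_version choices, a toy NumPy computation on a hand-made precision/recall curve; this is illustrative only, the real values are produced by layers.detection_map from the accumulated true/false positive counts.

import numpy as np

recall = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
precision = np.array([1.0, 0.9, 0.8, 0.6, 0.5, 0.4])

# '11point': average of the maximum precision at recall >= t, t = 0, 0.1, ..., 1.0
points = [precision[recall >= t].max() if (recall >= t).any() else 0.0
          for t in np.linspace(0, 1, 11)]
ap_11point = float(np.mean(points))

# 'integral': the natural integral of the precision-recall curve
ap_integral = float(np.trapz(precision, recall))

print(ap_11point, ap_integral)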
...@@ -120,6 +120,8 @@ class OpDescCreationMethod(object): ...@@ -120,6 +120,8 @@ class OpDescCreationMethod(object):
new_attr.strings.extend(user_defined_attr) new_attr.strings.extend(user_defined_attr)
elif attr.type == framework_pb2.BOOLEANS: elif attr.type == framework_pb2.BOOLEANS:
new_attr.bools.extend(user_defined_attr) new_attr.bools.extend(user_defined_attr)
elif attr.type == framework_pb2.LONGS:
new_attr.longs.extend(user_defined_attr)
elif attr.type == framework_pb2.INT_PAIRS: elif attr.type == framework_pb2.INT_PAIRS:
for p in user_defined_attr: for p in user_defined_attr:
pair = new_attr.int_pairs.add() pair = new_attr.int_pairs.add()
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from __future__ import print_function from __future__ import print_function
import re import re
import sys
from collections import defaultdict from collections import defaultdict
from paddle.fluid.framework import Program, Variable, name_scope, default_main_program from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
from . import framework from . import framework
...@@ -32,7 +33,8 @@ __all__ = [ ...@@ -32,7 +33,8 @@ __all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer' 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
'LarsMomentumOptimizer'
] ]
...@@ -105,7 +107,6 @@ class Optimizer(object): ...@@ -105,7 +107,6 @@ class Optimizer(object):
param = param_and_grad[0] param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] param_lr = param.optimize_attr['learning_rate']
if type(param_lr) == Variable: if type(param_lr) == Variable:
print("returns updated param lr ", param_lr)
return param_lr return param_lr
else: else:
if param_lr == 1.0: if param_lr == 1.0:
...@@ -400,6 +401,91 @@ class MomentumOptimizer(Optimizer): ...@@ -400,6 +401,91 @@ class MomentumOptimizer(Optimizer):
return momentum_op return momentum_op
class LarsMomentumOptimizer(Optimizer):
"""
Momentum optimizer with LARS support
The update equations are as follows:
.. math::
& local\_learning\_rate = learning\_rate * lars\_coeff * \\
\\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
& velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
& param = param - velocity
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
momentum (float): momentum factor
lars_coeff (float): defines how much we trust the layer to change its weights.
lars_weight_decay (float): weight decay coefficient for decaying using LARS.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.LarsMomentum(learning_rate=0.2, momentum=0.1, lars_weight_decay=0.001)
optimizer.minimize(cost)
"""
_velocity_acc_str = "velocity"
def __init__(self,
learning_rate,
momentum,
lars_coeff=0.001,
lars_weight_decay=0.0005,
regularization=None,
name=None):
assert learning_rate is not None
assert momentum is not None
super(LarsMomentumOptimizer, self).__init__(
learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "lars_momentum"
self._momentum = momentum
self._lars_coeff = float(lars_coeff)
self._lars_weight_decay = float(lars_weight_decay)
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": self._create_param_lr(param_and_grad)
},
outputs={
"ParamOut": param_and_grad[0],
"VelocityOut": velocity_acc
},
attrs={
"mu": self._momentum,
"lars_coeff": self._lars_coeff,
"lars_weight_decay": self._lars_weight_decay
})
return momentum_op
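A NumPy sketch of a single LARS momentum step for one parameter tensor, following the update equations in the docstring above.

import numpy as np

def lars_step(param, grad, velocity, lr,
              mu=0.9, lars_coeff=0.001, lars_weight_decay=0.0005):
    p_norm = np.linalg.norm(param)
    g_norm = np.linalg.norm(grad)
    # local_learning_rate = lr * lars_coeff * ||param|| / (||grad|| + wd * ||param||)
    local_lr = lr * lars_coeff * p_norm / (g_norm + lars_weight_decay * p_norm)
    velocity = mu * velocity + local_lr * (grad + lars_weight_decay * param)
    return param - velocity, velocity

param = np.random.randn(4).astype("float32")
grad = np.random.randn(4).astype("float32")
param, velocity = lars_step(param, grad, np.zeros_like(param), lr=0.001)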
class AdagradOptimizer(Optimizer): class AdagradOptimizer(Optimizer):
""" """
**Adaptive Gradient Algorithm (Adagrad)** **Adaptive Gradient Algorithm (Adagrad)**
...@@ -1221,6 +1307,7 @@ DecayedAdagrad = DecayedAdagradOptimizer ...@@ -1221,6 +1307,7 @@ DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer RMSProp = RMSPropOptimizer
Ftrl = FtrlOptimizer Ftrl = FtrlOptimizer
LarsMomentum = LarsMomentumOptimizer
class ModelAverage(Optimizer): class ModelAverage(Optimizer):
......
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test if(NOT APPLE)
foreach(src ${TEST_OPS}) # default test
py_test(${src} SRCS ${src}.py) foreach(src ${TEST_OPS})
endforeach() py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_image_classification_vgg")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_image_classification_resnet")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
...@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE) ...@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME) ...@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
if (py_test_modules_SERIAL) if (py_test_modules_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif() endif()
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
endif() endif()
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
...@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE ...@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
endif()
...@@ -90,12 +90,14 @@ class TestDistMnist2x2(TestDistRunnerBase): ...@@ -90,12 +90,14 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone()
# Optimization # Optimization
opt = fluid.optimizer.AdamOptimizer( # TODO(typhoonzero): fix distributed adam optimizer
learning_rate=0.001, beta1=0.9, beta2=0.999) # opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
# Reader # Reader
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size) paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch( test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size) paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost) opt.minimize(avg_cost)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
from dist_mnist import cnn_model
DTYPE = "float32"
def test_merge_reader(repeat_batch_size=8):
orig_reader = paddle.dataset.mnist.test()
record_batch = []
b = 0
for d in orig_reader():
if b >= repeat_batch_size:
break
record_batch.append(d)
b += 1
while True:
for d in record_batch:
yield d
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
# Reader
train_reader = paddle.batch(test_merge_reader, batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
from dist_mnist import cnn_model
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.LarsMomentumOptimizer(
learning_rate=0.001, momentum=0.9)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import signal
import subprocess
import argparse
import time
import math
import random
from multiprocessing import Process
from functools import reduce
import numpy as np
import unittest
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import io
from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
def _load_persistable_vars(self, executor, dirname, program):
def _is_checkpoint_var(var):
"""
the checkpoint will not save or load all the variables;
variables of type FEED_MINIBATCH/FETCH_LIST/RAW, or whose name contains "@GRAD", are discarded.
:param var(Variable): the variable to check.
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
# @GRAD are named for gradient variables, checkpoint will not save it.
if "@GRAD" in var.name:
return False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if ".trainer_" in var.name:
return False
# .block is named for distribute train variables, checkpoint will not save it.
if ".block" in var.name:
return False
if "tmp_" in var.name:
return False
return var.persistable
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def run_pserver(self, args):
self.get_model(batch_size=2)
# NOTE: pserver should not call memory optimize
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), args.endpoints,
args.trainers, args.sync_mode)
pserver_prog = t.get_pserver_program(args.current_endpoint)
startup_prog = t.get_startup_program(args.current_endpoint,
pserver_prog)
need_load = bool(int(os.getenv("LOAD", "0")))
model_dir = os.getenv("MODEL_DIR", "")
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
if need_load and model_dir:
self._load_persistable_vars(exe, model_dir, startup_prog)
exe.run(pserver_prog)
def run_trainer(self, args):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2)
if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist:
t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(),
args.endpoints, args.trainers,
args.sync_mode)
trainer_prog = t.get_trainer_program()
else:
trainer_prog = fluid.default_main_program()
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
startup_exe = fluid.Executor(place)
startup_exe.run(fluid.default_startup_program())
strategy = fluid.ExecutionStrategy()
strategy.num_threads = 1
strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy()
if args.use_reduce:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
else:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
exe = fluid.ParallelExecutor(
args.use_cuda,
loss_name=avg_cost.name,
exec_strategy=strategy,
build_strategy=build_stra)
feed_var_list = [
var for var in trainer_prog.global_block().vars.values()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
reader_generator = train_reader()
def get_data():
origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc:
new_batch = []
for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id:
new_batch.append(item)
return new_batch
else:
return origin_batch
need_save = bool(int(os.getenv("SAVE", "0")))
model_dir = os.getenv("MODEL_DIR", "")
if need_save:
for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data()))
if need_save and model_dir:
io.save_persistables(startup_exe, model_dir, trainer_prog)
var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
print(np.ravel(var).tolist())
if __name__ == "__main__":
paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
runtime_main(TestDistSaveLoad2x2)
...@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word, ...@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word,
name=pos_enc_param_name, name=pos_enc_param_name,
trainable=False, trainable=False,
initializer=fluid.initializer.ConstantInitializer(0.001))) initializer=fluid.initializer.ConstantInitializer(0.001)))
src_pos_enc.stop_gradient = True
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
return layers.dropout( return layers.dropout(
enc_input, enc_input,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
import paddle.fluid.core as core
from op_test import OpTest
class TestAddPositionEncodingTensorOp(OpTest):
"""
This class is to test the AddPositionEncodingOp
"""
def setUp(self):
"""
prepare inputs, outputs and attributes for the add_position_encoding op
"""
self.op_type = "add_position_encoding"
self.dtype = np.float32
self.init_input_output()
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
self.outputs = {'Out': self.out}
self.attrs = {'alpha': self.alpha, 'beta': self.beta}
def test_check_output(self):
"""
check the correctness of output
"""
self.check_output()
def test_check_grad(self):
"""
check the correctness of grad
"""
self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_input_output(self):
"""
init the input and output for test cases
"""
self.alpha = 0.6
self.beta = 0.5
self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
self.out = np.copy(self.x)
batch_size = self.x.shape[0]
max_length = self.x.shape[1]
enc_size = self.x.shape[2]
half_shape = int(enc_size / 2)
for i in range(batch_size):
for j in range(max_length):
for k in range(half_shape):
val = j / pow(10000.0, k / (
half_shape - 1)) if half_shape > 1 else j / 10000.0
self.out[i, j, k] = \
self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
self.out[i, j, half_shape + k] = \
self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
class TestAddPositionEncodingLoDTensorOp(OpTest):
"""
This class is to test the AddPositionEncodingLoDTensorOp
"""
def setUp(self):
"""
prepare inputs, outputs and attributes for the add_position_encoding LoDTensor op
"""
self.op_type = "add_position_encoding"
self.dtype = np.float32
self.init_input_output()
self.inputs = {'X': (self.x, self.lod), }
self.outputs = {'Out': (self.out, self.lod)}
self.attrs = {'alpha': self.alpha, 'beta': self.beta}
def test_check_output(self):
"""
check the correctness of output
"""
self.check_output()
def test_check_grad(self):
"""
check the correctness of grad
"""
self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_input_output(self):
"""
init the input and output for test cases
"""
self.alpha = 0.6
self.beta = 0.5
self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
self.lod = [[3, 7]]
self.out = np.copy(self.x)
batch_size = len(self.lod[0])
enc_size = self.x.shape[1]
start = 0
half_shape = int(enc_size / 2)
for i in range(batch_size):
max_length = self.lod[0][i]
for j in range(max_length):
for k in range(half_shape):
val = j / pow(10000.0, k / (
half_shape - 1)) if half_shape > 1 else j / 10000.0
pos = start + j
self.out[pos, k] = \
self.x[pos, k] * self.alpha + math.sin(val) * self.beta
self.out[pos, half_shape + k] = \
self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
start += max_length
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def AffineGrid(theta, size):
n = size[0]
w = size[3]
h = size[2]
h_idx = np.repeat(
np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
w_idx = np.repeat(
np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
grid = np.concatenate(
[w_idx, h_idx, np.ones([h, w, 1])], axis=2) # h * w * 3
grid = np.repeat(grid[np.newaxis, :], size[0], axis=0) # n * h * w *3
ret = np.zeros([n, h * w, 2])
theta = theta.transpose([0, 2, 1])
for i in range(len(theta)):
ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
# print ret.reshape([h * w, 2]).astype("float32")
return ret.reshape([n, h, w, 2]).astype("float32")
class TestAffineGridOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "affine_grid"
theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
theta = np.ones(self.theta_shape).astype("float32")
self.inputs = {'Theta': theta}
self.attrs = {"use_cudnn": True}
if self.dynamic_shape:
self.inputs['OutputShape'] = self.output_shape
else:
self.attrs['output_shape'] = self.output_shape
self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['Theta'],
'Output',
no_grad_set=['OutputShape'],
max_relative_error=0.006)
def initTestCase(self):
self.theta_shape = (3, 2, 3)
self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
self.dynamic_shape = False
class TestAffineGridOpCase1(TestAffineGridOp):
def initTestCase(self):
self.theta_shape = (3, 2, 3)
self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
self.dynamic_shape = True
if __name__ == '__main__':
unittest.main()
...@@ -22,14 +22,17 @@ import signal ...@@ -22,14 +22,17 @@ import signal
import subprocess import subprocess
import six import six
import argparse import argparse
import pickle
import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
RUN_STEP = 10 RUN_STEP = 10
DEFAULT_BATCH_SIZE = 2
class TestDistRunnerBase(object): class TestDistRunnerBase(object):
def get_model(self, batch_size=2): def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
raise NotImplementedError( raise NotImplementedError(
"get_model should be implemented by child classes.") "get_model should be implemented by child classes.")
...@@ -48,8 +51,7 @@ class TestDistRunnerBase(object): ...@@ -48,8 +51,7 @@ class TestDistRunnerBase(object):
return t return t
def run_pserver(self, args): def run_pserver(self, args):
self.get_model(batch_size=args.batch_size)
self.get_model(batch_size=2)
# NOTE: pserver should not call memory optimize # NOTE: pserver should not call memory optimize
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), args.endpoints, fluid.default_main_program(), args.endpoints,
...@@ -65,7 +67,7 @@ class TestDistRunnerBase(object): ...@@ -65,7 +67,7 @@ class TestDistRunnerBase(object):
def run_trainer(self, args): def run_trainer(self, args):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
self.get_model(batch_size=2) self.get_model(batch_size=args.batch_size)
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
...@@ -92,6 +94,11 @@ class TestDistRunnerBase(object): ...@@ -92,6 +94,11 @@ class TestDistRunnerBase(object):
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_stra = fluid.BuildStrategy() build_stra = fluid.BuildStrategy()
if args.batch_merge_repeat > 1:
pass_builder = build_stra._create_passes_from_strategy()
mypass = pass_builder.insert_pass(
len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
mypass.set_int("num_repeats", args.batch_merge_repeat)
if args.use_reduce: if args.use_reduce:
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
...@@ -123,10 +130,15 @@ class TestDistRunnerBase(object): ...@@ -123,10 +130,15 @@ class TestDistRunnerBase(object):
else: else:
return origin_batch return origin_batch
out_losses = []
for _ in six.moves.xrange(RUN_STEP): for _ in six.moves.xrange(RUN_STEP):
loss, = exe.run(fetch_list=[avg_cost.name], loss, = exe.run(fetch_list=[avg_cost.name],
feed=feeder.feed(get_data())) feed=feeder.feed(get_data()))
print(loss) out_losses.append(loss[0])
if six.PY2:
print(pickle.dumps(out_losses))
else:
sys.stdout.buffer.write(pickle.dumps(out_losses))
def runtime_main(test_class): def runtime_main(test_class):
...@@ -144,7 +156,10 @@ def runtime_main(test_class): ...@@ -144,7 +156,10 @@ def runtime_main(test_class):
parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_cuda', action='store_true')
parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--use_reduce', action='store_true')
parser.add_argument( parser.add_argument(
'--use_reader_alloc', action='store_true', required=False, default=True) '--use_reader_alloc', action='store_true', required=False)
parser.add_argument('--batch_size', required=False, type=int, default=2)
parser.add_argument(
'--batch_merge_repeat', required=False, type=int, default=1)
args = parser.parse_args() args = parser.parse_args()
...@@ -180,7 +195,7 @@ class TestDistBase(unittest.TestCase): ...@@ -180,7 +195,7 @@ class TestDistBase(unittest.TestCase):
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port()) self._find_free_port(), self._find_free_port())
self._python_interp = "python" self._python_interp = sys.executable
self._sync_mode = True self._sync_mode = True
self._enforce_place = None self._enforce_place = None
self._mem_opt = False self._mem_opt = False
...@@ -229,24 +244,18 @@ class TestDistBase(unittest.TestCase): ...@@ -229,24 +244,18 @@ class TestDistBase(unittest.TestCase):
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
def _wait_ps_ready(self, pid): def _run_local(self,
retry_times = 50 model,
while True: envs,
assert retry_times >= 0, "wait ps ready failed" check_error_log=False,
time.sleep(3) batch_size=DEFAULT_BATCH_SIZE,
try: batch_merge_repeat=1):
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error as e:
sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
(e, retry_times))
retry_times -= 1
def _run_local(self, model, envs, check_error_log):
cmd = "%s %s --role trainer" % (self._python_interp, model) cmd = "%s %s --role trainer" % (self._python_interp, model)
if batch_size != DEFAULT_BATCH_SIZE:
cmd += " --batch_size %d" % batch_size
if batch_merge_repeat > 1:
cmd += " --batch_merge_repeat %d" % batch_merge_repeat
if self.__use_cuda: if self.__use_cuda:
cmd += " --use_cuda" cmd += " --use_cuda"
...@@ -271,23 +280,20 @@ class TestDistBase(unittest.TestCase): ...@@ -271,23 +280,20 @@ class TestDistBase(unittest.TestCase):
env=envs) env=envs)
local_out, local_err = local_proc.communicate() local_out, local_err = local_proc.communicate()
local_ret = cpt.to_text(local_out)
if check_error_log: if check_error_log:
err_log.close() err_log.close()
sys.stderr.write('local_stdout: %s\n' % local_ret) sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
sys.stderr.write('local_stderr: %s\n' % local_err) sys.stderr.write('local_stderr: %s\n' % local_err)
local_losses = local_ret.split("\n") return pickle.loads(local_out)
return local_losses
def _run_cluster(self, model, envs, check_error_log): def _run_cluster(self, model, envs, check_error_log):
# Run dist train to compare with local results # Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model, ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
check_error_log, envs) check_error_log, envs)
self._wait_ps_ready(ps0.pid)
self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
...@@ -322,8 +328,8 @@ class TestDistBase(unittest.TestCase): ...@@ -322,8 +328,8 @@ class TestDistBase(unittest.TestCase):
env0.update(envs) env0.update(envs)
env1.update(envs) env1.update(envs)
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0)) print("tr0_cmd:{}".format(tr0_cmd))
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1)) print("tr1_cmd:{}".format(tr1_cmd))
tr0_pipe = open("/tmp/tr0_err.log", "wb") tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb")
...@@ -339,9 +345,7 @@ class TestDistBase(unittest.TestCase): ...@@ -339,9 +345,7 @@ class TestDistBase(unittest.TestCase):
env=env1) env=env1)
tr0_out, tr0_err = tr0_proc.communicate() tr0_out, tr0_err = tr0_proc.communicate()
tr0_loss_text = cpt.to_text(tr0_out)
tr1_out, tr1_err = tr1_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate()
tr1_loss_text = cpt.to_text(tr1_out)
# close trainer file # close trainer file
tr0_pipe.close() tr0_pipe.close()
...@@ -356,15 +360,13 @@ class TestDistBase(unittest.TestCase): ...@@ -356,15 +360,13 @@ class TestDistBase(unittest.TestCase):
ps1.terminate() ps1.terminate()
# print log # print log
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
tr0_losses = tr0_loss_text.split("\n") # return tr0_losses, tr1_losses
tr1_losses = tr1_loss_text.split("\n") return pickle.loads(tr0_out), pickle.loads(tr1_out)
return tr0_losses, tr1_losses
def check_with_place(self, def check_with_place(self,
model_file, model_file,
...@@ -394,9 +396,9 @@ class TestDistBase(unittest.TestCase): ...@@ -394,9 +396,9 @@ class TestDistBase(unittest.TestCase):
check_error_log) check_error_log)
for step_id in range(RUN_STEP): for step_id in range(RUN_STEP):
local_loss = eval(local_losses[step_id])[0] local_loss = local_losses[step_id]
tr0_loss = eval(tr0_losses[step_id])[0] tr0_loss = tr0_losses[step_id]
tr1_loss = eval(tr1_losses[step_id])[0] tr1_loss = tr1_losses[step_id]
dist_loss = (tr0_loss + tr1_loss) / 2 dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
print(str(local_loss) + ":" + str(dist_loss)) print("=======", local_loss, ":", dist_loss[0], "=======")
self.assertAlmostEqual(local_loss, dist_loss, delta=delta) self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
...@@ -18,14 +18,14 @@ import unittest ...@@ -18,14 +18,14 @@ import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
# FIXME(tangwei): sum op can not handle when inputs is empty.
class TestDistCTR2x2(TestDistBase): class TestDistCTR2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._enforce_place = "CPU" self._enforce_place = "CPU"
def test_dist_ctr(self):
def test_dist_ctr(self): self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -26,6 +26,15 @@ class TestDistMnist2x2(TestDistBase): ...@@ -26,6 +26,15 @@ class TestDistMnist2x2(TestDistBase):
self.check_with_place("dist_mnist.py", delta=1e-5) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnist2x2Lars(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
def test_se_resnext(self):
self.check_with_place("dist_mnist_lars.py", delta=1e-5)
class TestDistMnist2x2WithMemopt(TestDistBase): class TestDistMnist2x2WithMemopt(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
...@@ -40,8 +49,7 @@ class TestDistMnistAsync(TestDistBase): ...@@ -40,8 +49,7 @@ class TestDistMnistAsync(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._use_reduce = False self._use_reduce = False
# FIXME(typhoonzero): fix async mode test later def test_dist_train(self):
def no_test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=200) self.check_with_place("dist_mnist.py", delta=200)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase
import os
class TestDistMnist2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
def test_dist_train(self):
self.check_with_place("dist_mnist_batch_merge.py", delta=1e-5)
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1",
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
no_merge_losses = self._run_local(
model_file,
required_envs,
check_error_log=check_error_log,
batch_size=4)
batch_merge_losses = self._run_local(
model_file,
required_envs,
check_error_log=check_error_log,
batch_size=2,
batch_merge_repeat=2)
# Ensure both result have values.
self.assertGreater(len(no_merge_losses), 1)
self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
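The equivalence this test relies on: averaging gradients over one batch of 4 gives the same result as averaging the two gradients computed on its two halves of size 2, which is what multi_batch_merge_pass exploits. A quick NumPy identity check, not part of the test itself:

import numpy as np

g = np.random.randn(4, 3)                                 # per-sample gradients
full_batch = g.mean(axis=0)                               # batch_size=4, one step
merged = (g[:2].mean(axis=0) + g[2:].mean(axis=0)) / 2    # batch_size=2, repeat=2
assert np.allclose(full_batch, merged)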
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import shutil
import unittest
import tempfile
import numpy as np
from test_dist_base import TestDistBase, RUN_STEP
class TestDistSaveLoadDense2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._enforce_place = "CPU"
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"http_proxy": ""
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "7"
required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp()
local_env = {}
local_env["SAVE"] = "1"
local_env["MODEL_DIR"] = model_dir
local_env.update(required_envs)
cluster_env = {}
cluster_env["LOAD"] = "1"
cluster_env["MODEL_DIR"] = model_dir
cluster_env.update(required_envs)
local_var = self._run_local(model_file, local_env, check_error_log)
tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
check_error_log)
shutil.rmtree(model_dir)
local_np = np.array(eval(local_var[0]))
train0_np = np.array(eval(tr0_var[0]))
train1_np = np.array(eval(tr1_var[0]))
self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
@unittest.skip(reason="CI fail")
def test_dist(self):
need_envs = {
"IS_DISTRIBUTED": '0',
"IS_SPARSE": '0',
'IS_SELF_CONTAINED_LR': '1'
}
self.check_with_place(
"dist_save_load.py",
delta=0,
check_error_log=False,
need_envs=need_envs)
if __name__ == "__main__":
unittest.main()
...@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
self._use_reader_alloc = False self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistseResnXt2x2WithMemopt(TestDistBase): class TestDistseResnXt2x2WithMemopt(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
self._mem_opt = True self._mem_opt = True
self._use_reader_alloc = False
def test_dist_train(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistSeResneXt2x2Async(TestDistBase):
...@@ -40,8 +41,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): ...@@ -40,8 +41,7 @@ class TestDistSeResneXt2x2Async(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._use_reader_alloc = False self._use_reader_alloc = False
#FIXME(typhoonzero): fix async mode later def test_dist_train(self):
def no_test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)
......
...@@ -42,7 +42,6 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase): ...@@ -42,7 +42,6 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._enforce_place = "CPU" self._enforce_place = "CPU"
#FIXME(typhoonzero): fix async tests later
def no_test_simnet_bow(self): def no_test_simnet_bow(self):
need_envs = { need_envs = {
"IS_DISTRIBUTED": '0', "IS_DISTRIBUTED": '0',
...@@ -79,8 +78,7 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -79,8 +78,7 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._enforce_place = "CPU" self._enforce_place = "CPU"
#FIXME(typhoonzero): fix async tests later def test_simnet_bow(self):
def no_test_simnet_bow(self):
need_envs = { need_envs = {
"IS_DISTRIBUTED": '0', "IS_DISTRIBUTED": '0',
"IS_SPARSE": '1', "IS_SPARSE": '1',
...@@ -94,7 +92,6 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase): ...@@ -94,7 +92,6 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
# FIXME(tangwei): Learningrate variable is not created on pserver. # FIXME(tangwei): Learningrate variable is not created on pserver.
"""
class TestDistSimnetBow2x2LookupTableSync(TestDistBase): class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
...@@ -147,7 +144,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): ...@@ -147,7 +144,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
delta=1e-5, delta=1e-5,
check_error_log=False, check_error_log=False,
need_envs=need_envs) need_envs=need_envs)
"""
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest): ...@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
trainer, _ = self.get_trainer() trainer, _ = self.get_trainer()
class TestFtrl(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
opt = fluid.optimizer.Ftrl(learning_rate=0.1)
opt.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
trainer, _ = self.get_trainer()
class TestLRDecayConditional(TranspilerTest): class TestLRDecayConditional(TranspilerTest):
def net_conf(self): def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32') x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
...@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest): ...@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
["sum", "scale", "scale", "elementwise_add", "momentum"]) ["sum", "scale", "scale", "elementwise_add", "momentum"])
class TestEmptyPserverOptimizeBlocks(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
# only one parameter
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=False)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
sgd_optimizer.minimize(avg_cost)
def transpiler_test_impl(self):
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
self.assertEqual(len(pserver.blocks), 2)
self.assertEqual(len(pserver.blocks[1].ops), 0)
class TestDistLookupTableBase(TranspilerTest): class TestDistLookupTableBase(TranspilerTest):
def network_with_table(self, is_sparse, is_distributed): def network_with_table(self, is_sparse, is_distributed):
self.table_size = 1000 self.table_size = 1000
self.emb_size = 64 self.emb_size = 64
self.lookup_table_name = 'shared_w' self.lookup_table_name = 'shared_w'
def emb_pool(ids): def emb_pool(ids, table_name, is_distributed):
emb = fluid.layers.embedding( emb = fluid.layers.embedding(
input=ids, input=ids,
size=[self.table_size, self.emb_size], size=[self.table_size, self.emb_size],
dtype='float32', dtype='float32',
param_attr=self.lookup_table_name, # share parameter param_attr=table_name,
is_sparse=is_sparse, is_sparse=is_sparse,
is_distributed=is_distributed) is_distributed=is_distributed)
pool = fluid.layers.sequence_pool(input=emb, pool_type='average') pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
...@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest): ...@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
name='title_ids', shape=[1], dtype='int64', lod_level=1) name='title_ids', shape=[1], dtype='int64', lod_level=1)
brand_ids = fluid.layers.data( brand_ids = fluid.layers.data(
name='brand_ids', shape=[1], dtype='int64', lod_level=1) name='brand_ids', shape=[1], dtype='int64', lod_level=1)
title_emb = emb_pool(title_ids) profile_ids = fluid.layers.data(
brand_emb = emb_pool(brand_ids) name='brand_ids', shape=[1], dtype='int64', lod_level=1)
fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1) title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
profile_emb = emb_pool(profile_ids, "profile_emb", False)
fc0 = fluid.layers.concat(
input=[title_emb, brand_emb, profile_emb], axis=1)
predict = fluid.layers.fc(input=fc0, predict = fluid.layers.fc(input=fc0,
size=2, size=2,
act=None, act=None,
...@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
def transpiler_test_impl(self): def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep) pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 3) self.assertEqual(len(pserver1.blocks), 4)
# 0 listen_and_serv # 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam # 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops], self.assertEqual([op.type for op in pserver1.blocks[1].ops],
...@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self.assertEqual([op.type for op in pserver1.blocks[2].ops], self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "scale", "adam", "scale", "scale"]) ["sum", "scale", "adam", "scale", "scale"])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["sum", "scale", "adam", "scale", "scale"])
trainer, _ = self.get_trainer() trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1) self.assertEqual(len(trainer.blocks), 1)
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'fill_constant', 'mean_grad', 'cross_entropy_grad', 'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'split_selected_rows', 'send', 'sequence_pool_grad',
'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
...@@ -485,31 +540,43 @@ class TestDistLookupTable(TestDistLookupTableBase): ...@@ -485,31 +540,43 @@ class TestDistLookupTable(TestDistLookupTableBase):
# 1 optimize for fc_w or fc_b adam # 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops], self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["sum", "scale", "adam", "scale", "scale"]) ["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table sgd # 4 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[2].ops], self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table sgd
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["sum", "sgd"]) ["sum", "sgd"])
# 3 prefetch -> lookup_sparse_table for data0 # 3 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["lookup_sparse_table"])
# 4 prefetch -> lookup_sparse_table for data1
self.assertEqual([op.type for op in pserver1.blocks[4].ops], self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"]) ["lookup_sparse_table"])
# 5 save table # 5 save table
self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
trainer, _ = self.get_trainer() trainer, trainer_startup = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1) self.assertEqual(len(trainer.blocks), 1)
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'split_selected_rows', 'send',
'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'fetch_barrier' 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
startup_ops = [
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'uniform_random',
'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
'fake_init'
]
self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
startup_ops)
class TestAsyncLocalLookupTable(TestDistLookupTableBase): class TestAsyncLocalLookupTable(TestDistLookupTableBase):
...@@ -520,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ...@@ -520,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
config = fluid.DistributeTranspilerConfig() config = fluid.DistributeTranspilerConfig()
pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False) pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
self.assertEqual(len(pserver1.blocks), 3) self.assertEqual(len(pserver1.blocks), 4)
# 0 listen_and_serv # 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam # 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops], self.assertEqual([op.type for op in pserver1.blocks[1].ops],
...@@ -529,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): ...@@ -529,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops], self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["adam", "scale", "scale"]) ["adam", "scale", "scale"])
# 3 optimize for table adam
# NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["adam", "scale", "scale"])
trainer, _ = self.get_trainer(config) trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1) self.assertEqual(len(trainer.blocks), 1)
ops = [ ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'fill_constant', 'mean_grad', 'cross_entropy_grad', 'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv', 'split_selected_rows', 'send', 'sequence_pool_grad',
'recv', 'recv', 'concat' 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv',
'recv', 'concat', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
...@@ -558,12 +631,12 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -558,12 +631,12 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 1 optimize for fc_w or fc_b adam # 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops], self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["adam", "scale", "scale"]) ["adam", "scale", "scale"])
# 2 optimize for table sgd # 2 optimize for table adam
self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"]) self.assertEqual([op.type for op in pserver1.blocks[2].ops],
# 3 prefetch -> lookup_sparse_table for data0 ["adam", "scale", "scale"])
self.assertEqual([op.type for op in pserver1.blocks[3].ops], # 3 optimize for table sgd
["lookup_sparse_table"]) self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
# 4 prefetch -> lookup_sparse_table for data1 # 4 prefetch -> lookup_sparse_table for data0
self.assertEqual([op.type for op in pserver1.blocks[4].ops], self.assertEqual([op.type for op in pserver1.blocks[4].ops],
["lookup_sparse_table"]) ["lookup_sparse_table"])
# 5 save table # 5 save table
...@@ -572,13 +645,15 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): ...@@ -572,13 +645,15 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
trainer, _ = self.get_trainer(config) trainer, _ = self.get_trainer(config)
self.assertEqual(len(trainer.blocks), 1) self.assertEqual(len(trainer.blocks), 1)
ops = [ ops = [
'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', 'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'lookup_table_grad', 'split_selected_rows', 'send',
'sum', 'split_ids', 'send', 'recv', 'recv' 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv',
'recv', 'concat'
] ]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
......
...@@ -85,6 +85,69 @@ class TestDropoutOp5(OpTest): ...@@ -85,6 +85,69 @@ class TestDropoutOp5(OpTest):
self.check_output() self.check_output()
class TestDropoutOp6(TestDropoutOp):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {
'dropout_prob': 1.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': np.zeros((32, 64)).astype('float32'),
'Mask': np.zeros((32, 64)).astype('float32')
}
class TestDropoutOp7(TestDropoutOp):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
self.attrs = {
'dropout_prob': 0.0,
'fix_seed': True,
'is_test': False,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {
'Out': self.inputs['X'],
'Mask': np.ones((32, 64, 2)).astype('float32')
}
class TestDropoutOp8(OpTest):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {
'dropout_prob': 0.35,
'fix_seed': True,
'is_test': True,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {'Out': self.inputs['X']}
def test_check_output(self):
self.check_output()
class TestDropoutOp9(OpTest):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
self.attrs = {
'dropout_prob': 0.75,
'is_test': True,
'dropout_implementation': 'upscale_in_train'
}
self.outputs = {'Out': self.inputs['X']}
def test_check_output(self):
self.check_output()
class TestFP16DropoutOp(OpTest): class TestFP16DropoutOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestFakeInitOpSelectedRows(unittest.TestCase):
def check_with_place(self, place, is_selected_rows):
scope = core.Scope()
out_var_name = 'Out'
if is_selected_rows:
out_tensor = scope.var(out_var_name).get_selected_rows().get_tensor(
)
else:
out_tensor = scope.var(out_var_name).get_tensor()
var_shape = [4, 784]
# create and run fake_init_op
fake_init_op = Operator("fake_init", Out=out_var_name, shape=var_shape)
fake_init_op.run(scope, place)
self.assertEqual(var_shape, out_tensor._get_dims())
def test_fake_init_selected_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
for is_selected_rows in [True, False]:
self.check_with_place(place, is_selected_rows)
if __name__ == "__main__":
unittest.main()
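A minimal sketch of the same fake_init usage outside the test harness, mirroring the calls above. "emb_w" and its shape are made-up stand-ins for a lookup-table parameter whose real values are initialized elsewhere (as the transpiler change later in this diff arranges on the trainer side); fake_init only stamps shape metadata:
import paddle.fluid.core as core
from paddle.fluid.op import Operator

scope = core.Scope()
place = core.CPUPlace()
# "emb_w" is a hypothetical parameter; fake_init reserves its shape only.
out_tensor = scope.var("emb_w").get_tensor()
Operator("fake_init", Out="emb_w", shape=[1000, 64]).run(scope, place)
print(out_tensor._get_dims())  # [1000, 64]; values are left uninitialized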
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestHashOp(OpTest):
def setUp(self):
self.op_type = "hash"
self.init_test_case()
self.inputs = {'X': (self.in_seq, self.lod)}
self.attrs = {'num_hash': 4, 'mod_by': 10000}
self.outputs = {'Out': (self.out_seq, self.lod)}
def init_test_case(self):
np.random.seed(1)
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
# self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
self.out_seq = [
[[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
[[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
[[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
[[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
[[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
[[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
[[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
[[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
[[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
[[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
[[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
[[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
[[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
[[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
[[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
]
self.out_seq = np.array(self.out_seq)
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
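Reading the attrs above, every input id is hashed num_hash=4 times and each hash is reduced modulo mod_by=10000, so the (30, 1) sequence becomes a (30, 4, 1) output with the LoD unchanged. A shape-only sketch of that behavior; fake_hash below is a stand-in, the kernel's actual hash family is not reproduced here:
import numpy as np

def fake_hash(seed, value):
    # stand-in for the kernel's real hash; only the shape logic matters here
    return (seed * 131 + int(value)) & 0x7fffffff

num_hash, mod_by = 4, 10000
in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
out = np.empty((in_seq.shape[0], num_hash, 1), dtype="int32")
for i, row in enumerate(in_seq):
    for k in range(num_hash):
        out[i, k, 0] = fake_hash(k, row[0]) % mod_by
assert out.shape == (30, 4, 1)  # the LoD [[9, 4, 11, 6]] would carry over unchanged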
...@@ -865,6 +865,22 @@ class TestBook(unittest.TestCase): ...@@ -865,6 +865,22 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_affine_grid(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
out, ids = layers.argsort(input=data, axis=1)
theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
out_shape = layers.data(
name="out_shape", shape=[-1], dtype="float32")
data_0 = layers.affine_grid(theta, out_shape)
data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
self.assertIsNotNone(data_0)
self.assertIsNotNone(data_1)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): ...@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
exe.run(pserver_prog) exe.run(pserver_prog)
def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
trainer_id):
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
# loss function
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
# optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
ps1 = ip + ":" + str(int(port) + 1)
ps2 = ip + ":" + port
pserver_endpoints = ps1 + "," + ps2
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = False
t = fluid.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=sync_mode)
pserver_prog = t.get_pserver_program(ps2)
# pserver2 has no parameter
assert (len(pserver_prog.blocks) == 2)
assert (len(pserver_prog.blocks[1].ops) == 0)
pserver_startup = t.get_startup_program(ps2, pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
class TestListenAndServOp(OpTest): class TestListenAndServOp(OpTest):
def setUp(self): def setUp(self):
self.ps_timeout = 5 self.ps_timeout = 5
...@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest): ...@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
self.trainers = 1 self.trainers = 1
self.trainer_id = 0 self.trainer_id = 0
def _start_pserver(self, use_cuda, sync_mode): def _start_pserver(self, use_cuda, sync_mode, pserver_func):
p = Process( p = Process(
target=run_pserver, target=pserver_func,
args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
self.trainer_id)) self.trainer_id))
p.daemon = True p.daemon = True
...@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest): ...@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
def test_handle_signal_in_serv_op(self): def test_handle_signal_in_serv_op(self):
# run pserver on CPU in sync mode # run pserver on CPU in sync mode
p1 = self._start_pserver(False, True) p1 = self._start_pserver(False, True, run_pserver)
self._wait_ps_ready(p1.pid)
# raise SIGINT to pserver
os.kill(p1.pid, signal.SIGINT)
p1.join()
# run pserver on CPU in async mode
p2 = self._start_pserver(False, False, run_pserver)
self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver
os.kill(p2.pid, signal.SIGTERM)
p2.join()
def test_list_and_serv_run_empty_optimize_block(self):
# run pserver on CPU in sync mode
p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
self._wait_ps_ready(p1.pid) self._wait_ps_ready(p1.pid)
# raise SIGTERM to pserver # raise SIGTERM to pserver
...@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest): ...@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
p1.join() p1.join()
# run pserver on CPU in async mode # run pserver on CPU in async mode
p2 = self._start_pserver(False, False) p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
self._wait_ps_ready(p2.pid) self._wait_ps_ready(p2.pid)
# raise SIGTERM to pserver # raise SIGTERM to pserver
......
...@@ -22,15 +22,28 @@ from op_test import OpTest ...@@ -22,15 +22,28 @@ from op_test import OpTest
class TestMergeIdsOp(OpTest): class TestMergeIdsOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "merge_ids" self.op_type = "merge_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') ids1 = np.array([[0], [2], [5], [6]]).astype('int64')
x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32') ids2 = np.array([[0], [2], [2], [3]]).astype('int64')
x1 = np.array([]).astype('float32')
x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6], rows1 = np.array([[0], [2]]).astype('int64')
[0.5, 0.6]]).astype('float32') rows2 = np.array([[3], [5]]).astype('int64')
out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3], rows3 = np.array([[6]]).astype('int64')
[0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]} x0 = np.array([[0.1, 0.2], [0.2, 0.3]]).astype('float32')
self.outputs = {'Out': out} x1 = np.array([[0.3, 0.4], [0.4, 0.5]]).astype('float32')
x2 = np.array([[0.5, 0.6]]).astype('float32')
out1 = np.array(
[[0.1, 0.2], [0.2, 0.3], [0.4, 0.5], [0.5, 0.6]]).astype('float32')
out2 = np.array(
[[0.1, 0.2], [0.2, 0.3], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
self.inputs = {
'Ids': [('ids1', ids1), ('ids2', ids2)],
"Rows": [('rows1', rows1), ('rows2', rows2), ('rows3', rows3)],
"X": [('x0', x0), ('x1', x1), ('x2', x2)]
}
self.outputs = {'Out': [('out1', out1), ('out2', out2)]}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
......
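The reworked merge_ids inputs can be read as: Rows records which original ids each shard of X carries, and each Out is the matching Ids tensor with embeddings gathered back from those shards. A small dict-based sketch reproducing out1 and out2 from the test data above (plain Python, not the op itself):
import numpy as np

ids1 = [0, 2, 5, 6]
ids2 = [0, 2, 2, 3]
rows = [0, 2, 3, 5, 6]                                        # rows1 + rows2 + rows3
x = [[0.1, 0.2], [0.2, 0.3], [0.3, 0.4], [0.4, 0.5], [0.5, 0.6]]  # x0 + x1 + x2
emb = {row: np.array(vec, dtype="float32") for row, vec in zip(rows, x)}
out1 = np.stack([emb[i] for i in ids1])  # [[0.1,0.2],[0.2,0.3],[0.4,0.5],[0.5,0.6]]
out2 = np.stack([emb[i] for i in ids2])  # [[0.1,0.2],[0.2,0.3],[0.2,0.3],[0.3,0.4]]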
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
from paddle.fluid.framework import Program, program_guard
class TestMetricsDetectionMap(unittest.TestCase):
def test_detection_map(self):
program = fluid.Program()
with program_guard(program):
detect_res = fluid.layers.data(
name='detect_res',
shape=[10, 6],
append_batch_size=False,
dtype='float32')
label = fluid.layers.data(
name='label',
shape=[10, 1],
append_batch_size=False,
dtype='float32')
box = fluid.layers.data(
name='bbox',
shape=[10, 4],
append_batch_size=False,
dtype='float32')
map_eval = fluid.metrics.DetectionMAP(
detect_res, label, box, class_num=21)
cur_map, accm_map = map_eval.get_map_var()
self.assertIsNotNone(cur_map)
self.assertIsNotNone(accm_map)
print(str(program))
if __name__ == '__main__':
unittest.main()
...@@ -90,6 +90,45 @@ class TestMomentumOp2(OpTest): ...@@ -90,6 +90,45 @@ class TestMomentumOp2(OpTest):
self.check_output() self.check_output()
class TestLarsMomentumOp(OpTest):
def setUp(self):
self.op_type = "lars_momentum"
param = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
velocity = np.zeros((123, 321)).astype("float32")
learning_rate = np.array([0.001]).astype("float32")
mu = 0.0001
lars_coeff = 0.001
lars_weight_decay = 0.0005
self.inputs = {
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate
}
self.attrs = {
'mu': mu,
'lars_coeff': lars_coeff,
'lars_weight_decay': lars_weight_decay
}
pnorm = np.sqrt(np.square(param).sum())
gnorm = np.sqrt(np.square(grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * param)
velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay *
param)
param_out = param - velocity_out
self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
def test_check_output(self):
self.check_output()
class TestSparseMomentumOp(unittest.TestCase): class TestSparseMomentumOp(unittest.TestCase):
def setUp(self): def setUp(self):
self.use_nesterov = False self.use_nesterov = False
......
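The LARS reference computation in the test above scales the learning rate by a parameter-norm to gradient-norm ratio and then performs a momentum step. The same formula on toy scalars, mirroring the numpy reference line by line (illustrative numbers only):
import numpy as np

param, grad, velocity = np.array([2.0]), np.array([0.5]), np.array([0.0])
lr, mu, lars_coeff, lars_weight_decay = 0.001, 0.0001, 0.001, 0.0005

pnorm = np.sqrt(np.square(param).sum())          # 2.0
gnorm = np.sqrt(np.square(grad).sum())           # 0.5
local_lr = lr * lars_coeff * pnorm / (gnorm + lars_weight_decay * param)
velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay * param)
param_out = param - velocity_out                  # roughly 1.999998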
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
from threading import Thread
def user_reader(inputs):
def _reader():
for d in inputs:
yield d
return _reader
def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
def _feeder():
for batch_data in batch_reader():
sample_batch = []
label_batch = []
for sample, label in batch_data:
sample_batch.append(sample)
label_batch.append([label])
tensor = core.LoDTensor()
label = core.LoDTensor()
place = core.CUDAPinnedPlace() if pin_memory else core.CPUPlace()
tensor.set(np.array(sample_batch, dtype=img_dtype), place)
label.set(np.array(label_batch, dtype="int64"), place)
yield [tensor, label]
return _feeder
class TestPyReader(unittest.TestCase):
def setUp(self):
self.capacity = 10
self.shapes = [(-1, 3, 2, 1), (-1, 1)]
self.lod_levels = [0, 0]
self.dtypes = ['float32', 'int64']
def test_pin_memory_pyreader(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
executor = fluid.Executor(place)
data_file = fluid.layers.py_reader(
capacity=self.capacity,
dtypes=self.dtypes,
lod_levels=self.lod_levels,
shapes=self.shapes)
# feed_queue = data_file.queue
read_out_data = fluid.layers.read_file(data_file)
self.inputs = []
for _ in range(10):
sample = np.random.uniform(
low=0, high=1, size=[3, 2, 1]).astype("float32")
label = np.random.uniform(
low=0, high=10, size=[1]).astype("int64")
self.inputs.append((sample, label))
self.input_tensors = []
for d, l in batch_feeder(
paddle.batch(
user_reader(self.inputs), batch_size=2),
pin_memory=True
if fluid.core.is_compiled_with_cuda() else False)():
ta = fluid.LoDTensorArray()
ta.append(d)
ta.append(l)
self.input_tensors.append(ta)
self.batched_inputs = []
for batch in paddle.batch(user_reader(self.inputs), batch_size=2)():
feed_d = []
feed_l = []
for d, l in batch:
feed_d.append(d)
feed_l.append([l])
self.batched_inputs.append([feed_d, feed_l])
data_file.decorate_tensor_provider(
batch_feeder(
paddle.batch(
user_reader(self.inputs), batch_size=2),
pin_memory=True
if fluid.core.is_compiled_with_cuda() else False))
executor.run(fluid.default_startup_program())
self.outputs = []
data_file.start()
for _ in self.input_tensors:
self.outputs.append(
executor.run(fetch_list=list(read_out_data)))
data_file.reset()
self.validate()
def validate(self):
self.assertEqual(len(self.batched_inputs), len(self.outputs))
for in_data_list, out_data_list in zip(self.batched_inputs,
self.outputs):
self.assertEqual(len(in_data_list), len(out_data_list))
in_data_list_np = [
np.array(in_lod_tensor) for in_lod_tensor in in_data_list
]
for in_data, out_data in zip(in_data_list_np, out_data_list):
self.assertTrue((in_data == out_data).all())
if __name__ == '__main__':
unittest.main()
...@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): ...@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
def compute(self, x, offset, out):
self.attrs = {'pooltype': "MAX", 'is_test': True}
for i in range(len(offset[0]) - 1):
sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
(-1, 3 * 11))
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
def test_check_grad(self):
"""Grad computation does not apply to Sequence MAX
Pool executed when is_test is true """
return
class TestSeqLastPool2D(TestSeqAvgPool2D): class TestSeqLastPool2D(TestSeqAvgPool2D):
def compute(self, x, offset, out): def compute(self, x, offset, out):
self.attrs = {'pooltype': "LAST"} self.attrs = {'pooltype': "LAST"}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from op_test import OpTest
import numpy as np
class TestSequenceReverseBase(OpTest):
def initParameters(self):
pass
def setUp(self):
self.size = (10, 3, 4)
self.lod = [2, 3, 5]
self.dtype = 'float32'
self.initParameters()
self.op_type = 'sequence_reverse'
self.x = np.random.random(self.size).astype(self.dtype)
self.y = self.get_output()
self.inputs = {'X': (self.x, [self.lod, ]), }
self.outputs = {'Y': (self.y, [self.lod, ]), }
def get_output(self):
tmp_x = np.reshape(self.x, newshape=[self.x.shape[0], -1])
tmp_y = np.ndarray(tmp_x.shape).astype(self.dtype)
prev_idx = 0
for cur_len in self.lod:
idx_range = range(prev_idx, prev_idx + cur_len)
tmp_y[idx_range, :] = np.flip(tmp_x[idx_range, :], 0)
prev_idx += cur_len
return np.reshape(tmp_y, newshape=self.x.shape).astype(self.dtype)
def test_output(self):
self.check_output(0)
def test_grad(self):
self.check_grad(['X'], 'Y')
class TestSequenceReserve1(TestSequenceReverseBase):
def initParameters(self):
self.size = (12, 10)
self.lod = [4, 5, 3]
class TestSequenceReverse2(TestSequenceReverseBase):
def initParameters(self):
self.size = (12, 10)
self.lod = [12]
if __name__ == '__main__':
unittest.main()
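The get_output reference above reverses the rows inside each LoD segment while keeping the segment order. A worked toy case using the same row-count LoD convention as the test:
import numpy as np

x = np.arange(5).reshape(5, 1)      # rows 0..4
lod = [2, 3]                        # two sequences: rows [0, 1] and [2, 3, 4]
y = np.empty_like(x)
start = 0
for length in lod:
    y[start:start + length] = x[start:start + length][::-1]
    start += length
# y rows come out as [1, 0, 4, 3, 2]: each sequence reversed in place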
...@@ -25,18 +25,21 @@ from paddle.fluid.op import Operator ...@@ -25,18 +25,21 @@ from paddle.fluid.op import Operator
class TestSplitIdsOp(OpTest): class TestSplitIdsOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "split_ids" self.op_type = "split_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') ids1 = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
ids2 = np.array([[6], [2], [3], [3], [5], [2], [6]]).astype('int64')
ids3 = np.array([[2], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0 = np.array([[0], [3], [6]]).astype('int64') out0 = np.array([[0], [3], [6]]).astype('int64')
out1 = np.array([[]]).astype('int64') out1 = np.array([[]]).astype('int64')
out2 = np.array([[2], [2], [5], [5]]).astype('int64') out2 = np.array([[2], [5]]).astype('int64')
self.inputs = {'Ids': ids} self.inputs = {'Ids': [('ids1', ids1), ('ids2', ids2), ('ids3', ids3)]}
self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]} self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
class TestSpliteIds(unittest.TestCase): class TestSplitSelectedRows(unittest.TestCase):
def get_places(self): def get_places(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
return places return places
......
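Judging from the expected outputs, the reworked split_ids op takes several id tensors, de-duplicates the ids, and routes each id to output id % 3; out1 stays empty because no input id is congruent to 1 mod 3. A sketch of that routing (the de-duplication and sorted ordering are inferred from out0/out2 above, not stated elsewhere in the diff):
import numpy as np

ids = np.concatenate([
    [0, 2, 2, 3, 5, 5, 6],   # ids1
    [6, 2, 3, 3, 5, 2, 6],   # ids2
    [2, 2, 2, 3, 5, 5, 6],   # ids3
])
num_shards = 3
shards = [sorted({int(i) for i in ids if i % num_shards == k})
          for k in range(num_shards)]
# shards == [[0, 3, 6], [], [2, 5]], matching out0, out1, out2 in the test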
...@@ -99,7 +99,6 @@ class TestSpliteSelectedRows(unittest.TestCase): ...@@ -99,7 +99,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
out0_grad.set_height(height) out0_grad.set_height(height)
out0_grad_tensor = out0_grad.get_tensor() out0_grad_tensor = out0_grad.get_tensor()
np_array = np.ones((len(rows0), row_numel)).astype("float32") np_array = np.ones((len(rows0), row_numel)).astype("float32")
np_array[0, 0] = 2.0
out0_grad_tensor.set(np_array, place) out0_grad_tensor.set(np_array, place)
out1_grad = scope.var("out1@GRAD").get_selected_rows() out1_grad = scope.var("out1@GRAD").get_selected_rows()
...@@ -108,7 +107,6 @@ class TestSpliteSelectedRows(unittest.TestCase): ...@@ -108,7 +107,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
out1_grad.set_height(height) out1_grad.set_height(height)
out1_grad_tensor = out1_grad.get_tensor() out1_grad_tensor = out1_grad.get_tensor()
np_array = np.ones((len(rows1), row_numel)).astype("float32") np_array = np.ones((len(rows1), row_numel)).astype("float32")
np_array[0, 1] = 4.0
out1_grad_tensor.set(np_array, place) out1_grad_tensor.set(np_array, place)
x_grad = scope.var("X@GRAD").get_selected_rows() x_grad = scope.var("X@GRAD").get_selected_rows()
...@@ -121,11 +119,13 @@ class TestSpliteSelectedRows(unittest.TestCase): ...@@ -121,11 +119,13 @@ class TestSpliteSelectedRows(unittest.TestCase):
grad_op.run(scope, place) grad_op.run(scope, place)
self.assertEqual(x_grad.rows(), rows0 + rows1) merged_rows = set(rows0 + rows1)
self.assertEqual(set(x_grad.rows()), set(rows0 + rows1))
self.assertEqual(x_grad.height(), height) self.assertEqual(x_grad.height(), height)
print(np.array(x_grad.get_tensor()))
self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0]) self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1]) self.assertAlmostEqual(1.0, np.array(x_grad.get_tensor())[2, 1])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -45,16 +45,30 @@ class TestSumOp(OpTest): ...@@ -45,16 +45,30 @@ class TestSumOp(OpTest):
class TestSelectedRowsSumOp(OpTest): class TestSelectedRowsSumOp(OpTest):
def check_with_place(self, place): def check_with_place(self, place, inplace):
scope = core.Scope() self.height = 10
self.check_input_and_optput(scope, place, True, True, True) self.row_numel = 12
self.check_input_and_optput(scope, place, False, True, True) self.rows = [0, 1, 2, 3, 4, 5, 6]
self.check_input_and_optput(scope, place, False, False, True)
self.check_input_and_optput(scope, place, False, False, False) self.check_input_and_optput(core.Scope(), place, inplace, True, True,
True)
self.check_input_and_optput(core.Scope(), place, inplace, False, True,
True)
self.check_input_and_optput(core.Scope(), place, inplace, False, False,
True)
self.check_input_and_optput(core.Scope(), place, inplace, False, False,
False)
def _get_array(self, row_num, row_numel):
array = np.ones((row_num, row_numel)).astype("float32")
for i in range(row_num):
array[i] *= i
return array
def check_input_and_optput(self, def check_input_and_optput(self,
scope, scope,
place, place,
inplace,
w1_has_data=False, w1_has_data=False,
w2_has_data=False, w2_has_data=False,
w3_has_data=False): w3_has_data=False):
...@@ -64,35 +78,43 @@ class TestSelectedRowsSumOp(OpTest): ...@@ -64,35 +78,43 @@ class TestSelectedRowsSumOp(OpTest):
self.create_selected_rows(scope, place, "W3", w3_has_data) self.create_selected_rows(scope, place, "W3", w3_has_data)
# create Out Variable # create Out Variable
out = scope.var('Out').get_selected_rows() if inplace:
out_var_name = "W1"
else:
out_var_name = "Out"
out = scope.var(out_var_name).get_selected_rows()
# create and run sum operator # create and run sum operator
sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out') sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
sum_op.run(scope, place) sum_op.run(scope, place)
has_data_w_num = 0 has_data_w_num = 0
for w in [w1_has_data, w2_has_data, w3_has_data]: for has_data in [w1_has_data, w2_has_data, w3_has_data]:
if not w: if has_data:
has_data_w_num += 1 has_data_w_num += 1
self.assertEqual(7 * has_data_w_num, len(out.rows())) if has_data_w_num > 0:
self.assertEqual(len(out.rows()), 7)
self.assertTrue(
np.array_equal(
np.array(out.get_tensor()),
self._get_array(len(self.rows), self.row_numel) *
has_data_w_num))
else:
self.assertEqual(len(out.rows()), 0)
def create_selected_rows(self, scope, place, var_name, isEmpty): def create_selected_rows(self, scope, place, var_name, has_data):
# create and initialize W Variable # create and initialize W Variable
if not isEmpty: if has_data:
rows = [0, 1, 2, 3, 4, 5, 6] rows = self.rows
row_numel = 12
else: else:
rows = [] rows = []
row_numel = 12
var = scope.var(var_name) var = scope.var(var_name)
w_selected_rows = var.get_selected_rows() w_selected_rows = var.get_selected_rows()
w_selected_rows.set_height(len(rows)) w_selected_rows.set_height(self.height)
w_selected_rows.set_rows(rows) w_selected_rows.set_rows(rows)
w_array = np.ones((len(rows), row_numel)).astype("float32") w_array = self._get_array(len(rows), self.row_numel)
for i in range(len(rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor() w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place) w_tensor.set(w_array, place)
...@@ -100,9 +122,11 @@ class TestSelectedRowsSumOp(OpTest): ...@@ -100,9 +122,11 @@ class TestSelectedRowsSumOp(OpTest):
def test_w_is_selected_rows(self): def test_w_is_selected_rows(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
# currently only support CPU if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) for inplace in [True, False]:
self.check_with_place(place, inplace)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -35,6 +35,7 @@ import sys ...@@ -35,6 +35,7 @@ import sys
import numpy as np import numpy as np
import collections import collections
import six import six
import logging
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework from .. import core, framework
...@@ -475,6 +476,26 @@ class DistributeTranspiler(object): ...@@ -475,6 +476,26 @@ class DistributeTranspiler(object):
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
delete_ops(self.origin_program.global_block(), lr_ops) delete_ops(self.origin_program.global_block(), lr_ops)
# delete table init op
if self.has_distributed_lookup_table:
table_var = self.startup_program.global_block().vars[
self.table_name]
table_param_init_op = []
for op in self.startup_program.global_block().ops:
if self.table_name in op.output_arg_names:
table_param_init_op.append(op)
init_op_num = len(table_param_init_op)
if init_op_num != 1:
raise ValueError("table init op num should be 1, now is " + str(
init_op_num))
table_init_op = table_param_init_op[0]
self.startup_program.global_block().append_op(
type="fake_init",
inputs={},
outputs={"Out": table_var},
attrs={"shape": table_init_op.attr('shape')})
delete_ops(self.startup_program.global_block(), table_param_init_op)
self.origin_program.__str__() self.origin_program.__str__()
if wait_port: if wait_port:
...@@ -713,7 +734,7 @@ in a single call.") ...@@ -713,7 +734,7 @@ in a single call.")
for _, op in enumerate(self.optimize_ops): for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself # optimizer is connected to itself
if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \ if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
op not in global_ops: op not in global_ops:
log("append opt op: ", op.type, op.input_arg_names, log("append opt op: ", op.type, op.input_arg_names,
merged_var) merged_var)
__append_optimize_op__(op, per_opt_block, __append_optimize_op__(op, per_opt_block,
...@@ -747,6 +768,15 @@ in a single call.") ...@@ -747,6 +768,15 @@ in a single call.")
prefetch_var_name_to_block_id.extend( prefetch_var_name_to_block_id.extend(
lookup_table_var_name_to_block_id) lookup_table_var_name_to_block_id)
if len(optimize_blocks) == 0:
logging.warn("pserver [" + str(endpoint) +
"] has no optimize block!!")
pre_block_idx = pserver_program.num_blocks - 1
empty_block = pserver_program._create_block(pre_block_idx)
optimize_blocks.append(empty_block)
# In some cases, a parameter server will have no parameter to optimize,
# so we give it an empty optimize block.
attrs = { attrs = {
"optimize_blocks": optimize_blocks, "optimize_blocks": optimize_blocks,
"endpoint": endpoint, "endpoint": endpoint,
...@@ -890,11 +920,11 @@ to transpile() call.") ...@@ -890,11 +920,11 @@ to transpile() call.")
block_idx = int(block_name.split(block_suffix)[1]) block_idx = int(block_name.split(block_suffix)[1])
orig_var = self.origin_program.global_block().vars[orig_var_name] orig_var = self.origin_program.global_block().vars[orig_var_name]
skip_numel = 0 skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name] slice_vars = self.param_var_mapping[orig_var_name]
for slice_var in slice_vars[:block_idx]: for slice_var in slice_vars[:block_idx]:
skip_numel += reduce(lambda x, y: x * y, slice_var.shape) skip_dim0 += slice_var.shape[0]
slice_vars_and_attrs.append([orig_var, skip_numel, param]) slice_vars_and_attrs.append([orig_var, skip_dim0, param])
return slice_vars_and_attrs return slice_vars_and_attrs
...@@ -1034,90 +1064,87 @@ to transpile() call.") ...@@ -1034,90 +1064,87 @@ to transpile() call.")
def _replace_lookup_table_op_with_prefetch(self, program, def _replace_lookup_table_op_with_prefetch(self, program,
pserver_endpoints): pserver_endpoints):
# 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
# self.all_prefetch_input_vars = self.all_in_ids_vars = []
# [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
# [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
self.all_prefetch_input_vars = [] self.all_prefetch_input_vars = []
# self.all_prefetch_input_vars =
# [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
# [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
self.all_prefetch_output_vars = [] self.all_prefetch_output_vars = []
self.all_out_emb_vars = []
lookup_table_op_index = -1
continue_search_lookup_table_op = True continue_search_lookup_table_op = True
while continue_search_lookup_table_op: while continue_search_lookup_table_op:
continue_search_lookup_table_op = False continue_search_lookup_table_op = False
all_ops = program.global_block().ops all_ops = program.global_block().ops
for op in all_ops: for op in all_ops:
if op.type == LOOKUP_TABLE_TYPE: if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
"W")[0]:
if not op.attr('is_distributed'):
raise RuntimeError(
"lookup_table_op that lookup an distributed embedding table"
"should set is_distributed to true")
continue_search_lookup_table_op = True continue_search_lookup_table_op = True
lookup_table_op_index = list(all_ops).index(op) lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
all_ops).index(op)
ids_name = op.input("Ids") ids_name = op.input("Ids")
out_name = op.output("Out") out_name = op.output("Out")
ids_var = program.global_block().vars[ids_name[0]] ids_var = program.global_block().vars[ids_name[0]]
prefetch_input_vars = self._create_splited_vars( self.all_in_ids_vars.append(ids_var)
source_var=ids_var,
block=program.global_block(),
tag="_prefetch_in_")
self.all_prefetch_input_vars.append(prefetch_input_vars)
out_var = program.global_block().vars[out_name[0]] out_var = program.global_block().vars[out_name[0]]
prefetch_output_vars = self._create_splited_vars( self.all_out_emb_vars.append(out_var)
source_var=out_var,
block=program.global_block(),
tag="_prefetch_out_")
self.all_prefetch_output_vars.append(prefetch_output_vars)
# insert split_ids_op
program.global_block()._insert_op(
index=lookup_table_op_index,
type="split_ids",
inputs={
'Ids': [
program.global_block().vars[varname]
for varname in ids_name
]
},
outputs={"Out": prefetch_input_vars})
# insert prefetch_op
program.global_block()._insert_op(
index=lookup_table_op_index + 1,
type="prefetch",
inputs={'X': prefetch_input_vars},
outputs={"Out": prefetch_output_vars},
attrs={
"epmap": pserver_endpoints,
# FIXME(qiao) temporarily disable this config because prefetch
# is not act as other rpc op, it's more like a forward op
# RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
# insert concat_op
program.global_block()._insert_op(
index=lookup_table_op_index + 2,
type="merge_ids",
inputs={
'Ids': [
program.global_block().vars[varname]
for varname in ids_name
],
'X': prefetch_output_vars
},
outputs={
"Out": [
program.global_block().vars[varname]
for varname in out_name
]
})
# delete lookup_table_op # delete lookup_table_op
delete_ops(program.global_block(), [op]) delete_ops(program.global_block(), [op])
# break for loop # break for loop
break break
for index in range(len(self.pserver_endpoints)):
in_var = program.global_block().create_var(
name=str("prefetch_compress_in_tmp_" + str(index)),
type=self.all_in_ids_vars[0].type,
shape=self.all_in_ids_vars[0].shape,
dtype=self.all_in_ids_vars[0].dtype)
self.all_prefetch_input_vars.append(in_var)
out_var = program.global_block().create_var(
name=str("prefetch_compress_out_tmp_" + str(index)),
type=self.all_out_emb_vars[0].type,
shape=self.all_out_emb_vars[0].shape,
dtype=self.all_out_emb_vars[0].dtype)
self.all_prefetch_output_vars.append(out_var)
# insert split_ids_op
program.global_block()._insert_op(
index=lookup_table_op_index,
type="split_ids",
inputs={'Ids': self.all_in_ids_vars},
outputs={"Out": self.all_prefetch_input_vars})
# insert prefetch_op
program.global_block()._insert_op(
index=lookup_table_op_index + 1,
type="prefetch",
inputs={'X': self.all_prefetch_input_vars},
outputs={"Out": self.all_prefetch_output_vars},
attrs={
"epmap": pserver_endpoints,
# FIXME(qiao) temporarily disable this config because prefetch
# is not act as other rpc op, it's more like a forward op
# RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
# insert concat_op
program.global_block()._insert_op(
index=lookup_table_op_index + 2,
type="merge_ids",
inputs={
'Ids': self.all_in_ids_vars,
'Rows': self.all_prefetch_input_vars,
'X': self.all_prefetch_output_vars
},
outputs={"Out": self.all_out_emb_vars})
def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
# 2. add split_ids_op and send_op to send gradient to pservers # 2. add split_ids_op and send_op to send gradient to pservers
...@@ -1134,7 +1161,8 @@ to transpile() call.") ...@@ -1134,7 +1161,8 @@ to transpile() call.")
inputs={ inputs={
'Ids': [program.global_block().vars[table_grad_name]] 'Ids': [program.global_block().vars[table_grad_name]]
}, },
outputs={"Out": self.trainer_side_table_grad_list}) outputs={"Out": self.trainer_side_table_grad_list},
attrs={RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE})
program.global_block()._insert_op( program.global_block()._insert_op(
index=op_index + 2, index=op_index + 2,
type="send", type="send",
...@@ -1160,32 +1188,31 @@ to transpile() call.") ...@@ -1160,32 +1188,31 @@ to transpile() call.")
# STEP: create prefetch block # STEP: create prefetch block
table_var = pserver_program.global_block().vars[self.table_name] table_var = pserver_program.global_block().vars[self.table_name]
prefetch_var_name_to_block_id = [] prefetch_var_name_to_block_id = []
for index in range(len(self.all_prefetch_input_vars)): prefetch_block = pserver_program._create_block(optimize_block.idx)
prefetch_block = pserver_program._create_block(optimize_block.idx) trainer_ids = self.all_prefetch_input_vars[pserver_index]
trainer_ids = self.all_prefetch_input_vars[index][pserver_index] pserver_ids = pserver_program.global_block().create_var(
pserver_ids = pserver_program.global_block().create_var( name=trainer_ids.name,
name=trainer_ids.name, type=trainer_ids.type,
type=trainer_ids.type, shape=trainer_ids.shape,
shape=trainer_ids.shape, dtype=trainer_ids.dtype)
dtype=trainer_ids.dtype) trainer_out = self.all_prefetch_output_vars[pserver_index]
trainer_out = self.all_prefetch_output_vars[index][pserver_index] pserver_out = pserver_program.global_block().create_var(
pserver_out = pserver_program.global_block().create_var( name=trainer_out.name,
name=trainer_out.name, type=trainer_out.type,
type=trainer_out.type, shape=trainer_out.shape,
shape=trainer_out.shape, dtype=trainer_out.dtype)
dtype=trainer_out.dtype) prefetch_block.append_op(
prefetch_block.append_op( type="lookup_sparse_table",
type="lookup_sparse_table", inputs={'Ids': pserver_ids,
inputs={'Ids': pserver_ids, "W": table_var},
"W": table_var}, outputs={"Out": pserver_out},
outputs={"Out": pserver_out}, attrs={
attrs={ "is_sparse": True, # has no effect on lookup_table op
"is_sparse": True, # has no effect on lookup_table op "is_distributed": True,
"is_distributed": True, "padding_idx": -1
"padding_idx": -1 })
}) prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str( prefetch_block.idx))
prefetch_block.idx))
return prefetch_var_name_to_block_id return prefetch_var_name_to_block_id
def _create_table_optimize_block(self, pserver_index, pserver_program, def _create_table_optimize_block(self, pserver_index, pserver_program,
...@@ -1263,7 +1290,6 @@ to transpile() call.") ...@@ -1263,7 +1290,6 @@ to transpile() call.")
} }
outputs = {"ParamOut": [param_var]} outputs = {"ParamOut": [param_var]}
# only support sgd now # only support sgd now
import logging
logging.warn( logging.warn(
"distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of " "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+ table_opt_op.type) + table_opt_op.type)
...@@ -1364,16 +1390,6 @@ to transpile() call.") ...@@ -1364,16 +1390,6 @@ to transpile() call.")
program.global_block()._sync_with_cpp() program.global_block()._sync_with_cpp()
return var_mapping return var_mapping
def _create_splited_vars(self, source_var, block, tag):
return [
block.create_var(
name=str(source_var.name + tag + str(index)),
type=source_var.type,
shape=source_var.shape,
dtype=source_var.dtype)
for index in range(len(self.pserver_endpoints))
]
def _clone_var(self, block, var, persistable=True): def _clone_var(self, block, var, persistable=True):
return block.create_var( return block.create_var(
name=var.name, name=var.name,
...@@ -1431,7 +1447,7 @@ to transpile() call.") ...@@ -1431,7 +1447,7 @@ to transpile() call.")
elif op_type == "adamax": elif op_type == "adamax":
if varkey in ["Moment", "InfNorm"]: if varkey in ["Moment", "InfNorm"]:
return param_shape return param_shape
elif op_type == "momentum": elif op_type in ["momentum", "lars_momentum"]:
if varkey == "Velocity": if varkey == "Velocity":
return param_shape return param_shape
elif op_type == "rmsprop": elif op_type == "rmsprop":
...@@ -1440,8 +1456,15 @@ to transpile() call.") ...@@ -1440,8 +1456,15 @@ to transpile() call.")
elif op_type == "decayed_adagrad": elif op_type == "decayed_adagrad":
if varkey == "Moment": if varkey == "Moment":
return param_shape return param_shape
elif op_type == "ftrl":
if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
return param_shape
elif op_type == "sgd": elif op_type == "sgd":
pass pass
else:
raise ValueError(
"Not supported optimizer for distributed training: %s" %
op_type)
return orig_shape return orig_shape
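The branch above answers one question per optimizer op: which accumulator inputs must be re-shaped to match the split parameter shard, and which keep their original shape. A table-driven sketch of the same decision (the adamax/momentum/lars_momentum/decayed_adagrad/ftrl/sgd entries mirror the branches visible in this hunk; the rmsprop accumulator names are an assumption, since they are elided here):

```python
# Sketch mirroring the shape decision above; not the transpiler's actual code.
ACCUMULATORS_FOLLOWING_PARAM = {
    "adamax": ["Moment", "InfNorm"],
    "momentum": ["Velocity"],
    "lars_momentum": ["Velocity"],
    "rmsprop": ["Moment", "MeanSquare"],   # assumed, elided in this hunk
    "decayed_adagrad": ["Moment"],
    "ftrl": ["SquaredAccumulator", "LinearAccumulator"],
    "sgd": [],                             # plain SGD keeps no accumulators
}

def optimizer_input_shape(op_type, varkey, orig_shape, param_shape):
    if op_type not in ACCUMULATORS_FOLLOWING_PARAM:
        raise ValueError(
            "Not supported optimizer for distributed training: %s" % op_type)
    if varkey in ACCUMULATORS_FOLLOWING_PARAM[op_type]:
        return param_shape
    return orig_shape

# Example: the Velocity accumulator of momentum follows the split param shape,
# while unrelated inputs such as the learning rate keep their original shape.
assert optimizer_input_shape("momentum", "Velocity", (100, 8), (25, 8)) == (25, 8)
assert optimizer_input_shape("momentum", "LearningRate", (1,), (25, 8)) == (1,)
```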
def _get_varname_parts(self, varname): def _get_varname_parts(self, varname):
......
...@@ -61,6 +61,9 @@ class InferenceTranspiler(object): ...@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
if use_mkldnn:
self._depthwise_conv_mkldnn(program)
self._fuse_batch_norm(program, place, scope) self._fuse_batch_norm(program, place, scope)
if use_mkldnn: if use_mkldnn:
self._fuse_conv_bias_mkldnn(program) self._fuse_conv_bias_mkldnn(program)
...@@ -70,6 +73,31 @@ class InferenceTranspiler(object): ...@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
program) # ResNet residual block merging program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program) self._fuse_bn_relu_mkldnn(program)
def _depthwise_conv_mkldnn(self, program):
'''
Transpile the program by replacing depthwise_conv2d with conv2d for an MKLDNN program.
The result is:
- before:
- any_other_op->depthwise_conv->any_other_op
- after:
- any_other_op->conv->any_other_op
:param program: program to transpile
:type program: Program
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops):
current_op = self.block.ops[i]
if current_op.type == 'depthwise_conv2d':
current_op.desc.set_type("conv2d")
i = i + 1
# TODO(luotao): use the clone() method to force-flush program.desc,
# since a large program.desc may not be flushed immediately.
# A better solution will be considered later.
program = program.clone()
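This pass runs as part of InferenceTranspiler.transpile() when FLAGS_use_mkldnn is set, as wired up in the hunk above. A hedged usage sketch, assuming the Fluid 1.x API of this commit's era and a hypothetical saved-model directory:

```python
import os
import paddle.fluid as fluid

MODEL_DIR = "./my_inference_model"  # hypothetical path to a saved inference model

place = fluid.CPUPlace()
exe = fluid.Executor(place)
# load_inference_model returns (program, feed_target_names, fetch_targets).
program, feed_names, fetch_targets = fluid.io.load_inference_model(MODEL_DIR, exe)

os.environ["FLAGS_use_mkldnn"] = "1"  # enable the MKLDNN-specific passes

t = fluid.transpiler.InferenceTranspiler()
t.transpile(program, place, scope=fluid.global_scope())
# `program` now has depthwise_conv2d ops rewritten to conv2d, plus the
# conv+bn / conv+bias / conv+eltwise fusions applied where they match.
```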
def _fuse_conv_eltwise_mkldnn(self, program): def _fuse_conv_eltwise_mkldnn(self, program):
''' '''
Transpile the program fusing elementwise_add into conv for MKLDNN Transpile the program fusing elementwise_add into conv for MKLDNN
......
...@@ -171,7 +171,7 @@ class ControlFlowGraph(object): ...@@ -171,7 +171,7 @@ class ControlFlowGraph(object):
self._live_out[i] |= self._live_in[s] self._live_out[i] |= self._live_in[s]
self._live_in[i] = self._uses[i] | ( self._live_in[i] = self._uses[i] | (
self._live_out[i] - self._defs[i]) self._live_out[i] - self._defs[i])
if live_in[i] != self._live_in[i]: if live_in[i] != set(self._live_in[i]):
for d in self._presuccessors[i]: for d in self._presuccessors[i]:
worklist.append(d) worklist.append(d)
...@@ -321,8 +321,7 @@ class ControlFlowGraph(object): ...@@ -321,8 +321,7 @@ class ControlFlowGraph(object):
if not compare_shape(x_shape, cache_shape, level): if not compare_shape(x_shape, cache_shape, level):
continue continue
# TODO(qijun): actually, we should compare # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
# dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if x_dtype != cache_dtype: if x_dtype != cache_dtype:
continue continue
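The TODO kept in this hunk points out that the reuse check is stricter than needed: it demands identical dtypes, whereas the comment suggests comparing element sizes via a dtype_to_size table, so that e.g. an int32 variable could reuse a float32 cache slot of the same shape. A hedged sketch of that size-based comparison (the table below is an assumption for illustration, not the transpiler's actual map):

```python
# Assumed element-size table; the transpiler keeps a comparable dtype_to_size
# map keyed by VarDesc dtypes rather than strings.
dtype_to_size = {
    "bool": 1, "int16": 2, "float16": 2,
    "int32": 4, "float32": 4,
    "int64": 8, "float64": 8,
}

def can_reuse_cache(x_dtype, cache_dtype):
    # Current behavior: exact dtype match only.
    # The TODO's relaxation: treat dtypes of equal byte size as interchangeable.
    return dtype_to_size[x_dtype] == dtype_to_size[cache_dtype]

assert can_reuse_cache("int32", "float32")        # same 4-byte size -> reusable
assert not can_reuse_cache("float16", "float64")  # different sizes  -> skip
```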
...@@ -487,7 +486,6 @@ def memory_optimize(input_program, ...@@ -487,7 +486,6 @@ def memory_optimize(input_program,
skip_opt_set = grad_set skip_opt_set = grad_set
else: else:
skip_opt_set.update(grad_set) skip_opt_set.update(grad_set)
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
......
...@@ -12,5 +12,5 @@ ...@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from plot import Ploter from .plot import Ploter
__all__ = ['dump_config', 'Ploter'] __all__ = ['dump_config', 'Ploter']