diff --git a/Dockerfile b/Dockerfile index 402adee2ea2822250ebc8f6229fd6a44545d58e5..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ tar -xz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/README.md b/README.md index a67cb8ad439f462c361cb6bac2449c3a4b042126..60ffbe728178705b1734e682868614025214c2a4 100644 --- a/README.md +++ b/README.md @@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ## Installation -It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html) -before looking into the -[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html). +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website. ## Documentation -We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and -[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation. -- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html) +- [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html) - - You can also run distributed training jobs on Kubernetes clusters. - -- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! 
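The NvInfer.h tweak described in the Dockerfile comment above amounts to declaring a destructor on TensorRT's IPluginFactory interface. As a rough sketch of the patched declaration (illustrative only; the real header is NVIDIA's, its other members are elided, and declaring the destructor virtual is an assumption here):

```cpp
// Hypothetical sketch of the manual NvInfer.h edit referenced above
// (see https://github.com/PaddlePaddle/Paddle/issues/10129).
namespace nvinfer1 {
class IPluginFactory {
 public:
  // ... existing pure-virtual plugin-creation methods elided ...
  virtual ~IPluginFactory() {}  // the manually added declaration
};
}  // namespace nvinfer1
```

A declared (virtual) destructor also makes deleting a factory through the interface pointer well-defined, which is the usual motivation for this kind of header patch.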
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index b520c03a836a9e3f263ba050f151877ffe0d071d..03c73786a6c31868b1893bfcb319e43e37db1a3d 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. +if (NOT WIN32) # windows msvc2015 supports c++11 natively. +# -std=c++11 and -fPIC are not recognized by msvc; -Xcompiler will be added by cmake. list(APPEND CUDA_NVCC_FLAGS "-std=c++11") -list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +endif(NOT WIN32) + +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") # in cuda9, suppress cuda warning on eigen list(APPEND CUDA_NVCC_FLAGS "-w") # Set :expt-relaxed-constexpr to suppress Eigen warnings list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") +if (NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "Release") @@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") # nvcc 9 does not support -Os. Use Release flags instead list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() +else(NOT WIN32) +if(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") +else() + message(FATAL_ERROR "Windows only supports Release builds now. Please set the Visual Studio build type to Release, x64.") +endif() +endif(NOT WIN32) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 7fb67afbe15a5a019c978092d5ba3a4a0f66d996..fd9835d023c67b76579913f2ec56c2444fea8c15 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -44,7 +44,7 @@ ExternalProject_Add( # 3. keep only zlib, cares, protobuf, boringssl under "third_party", # checkout and clean other dirs under third_party # 4. remove .git, and package the directory.
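Regarding the `--expt-relaxed-constexpr` flag kept in the cuda.cmake hunk above: by default nvcc rejects calls from device code to host `constexpr` functions, a pattern Eigen relies on heavily; the flag relaxes that check (and `-w` silences the remaining Eigen warnings). A minimal CUDA sketch of the construct involved, hypothetical code not taken from this patch:

```cpp
// kHalf() is a plain host constexpr function; without
// --expt-relaxed-constexpr, nvcc refuses to call it from the
// __global__ kernel below.
constexpr float kHalf() { return 0.5f; }

__global__ void Scale(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= kHalf();  // accepted only with the flag
}
```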
- URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" + URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz" URL_MD5 "1f268a2aff6759839dccd256adcc91cf" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index bc36683a9facc253e7b9feb0c5a56e79491fb9b0..f61770514eb05a99c140cdb18575c89aa5235c14 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") if (NOT WIN32) -copy(framework_lib DEPS framework_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} -) -else() -copy(framework_lib +set(framework_lib_deps framework_py_proto) +endif(NOT WIN32) +copy(framework_lib DEPS ${framework_lib_deps} SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir ) -endif(NOT WIN32) set(module "memory") copy(memory_lib @@ -161,7 +158,8 @@ set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst index 56fa928029903f1e3bd3e8064c146797f01b2b85..cca3684b78518867eae95d82e1347b52427ddc81 100644 --- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst +++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst @@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式: :maxdepth: 2 feeding_data - use_recordio_reader Python Reader ############# diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst deleted file mode 100644 index dfda33f1b03516fe2c704f55d095955282b19109..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. 
_user_guide_use_recordio_as_train_data: - -############################ -使用RecordIO文件作为训练数据 -############################ - -相比于 :ref:`user_guide_use_numpy_array_as_train_data`, -:ref:`user_guide_use_recordio_as_train_data` 的性能更好; -但是用户需要先将训练数据集转换成RecordIO文件格式,再使用 -:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。 -用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝, -使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。 - -将训练数据转换成RecordIO文件格式 -################################ - -:code:`fluid.recordio_writer` 中,每个记录都是一个 -:code:`vector`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需 -的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。 - -用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将 -:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用 -:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个 -:ref:`user_guide_reader` 转换成多个RecordIO文件。 - -具体使用方法为: - -.. code-block:: python - - import paddle.fluid as fluid - import numpy - - def reader_creator(): - def __impl__(): - for i in range(1000): - yield [ - numpy.random.random(size=[3,224,224], dtype="float32"), - numpy.random.random(size=[1], dtype="int64") - ] - return __impl__ - - img = fluid.layers.data(name="image", shape=[3, 224, 224]) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace()) - - BATCH_SIZE = 32 - reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE) - fluid.recordio_writer.convert_reader_to_recordio_file( - "train.recordio", feeder=feeder, reader_creator=reader) - -其中 :code:`reader_creator` 创建了一个 :code:`Reader`。 -:ref:`_api_fluid_data_feeder_DataFeeder` -是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考 -:ref:`user_guide_reader` 。 - -上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件, -其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整, -用户可以将每一个Record的样本数设置成1。并参考 -:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。 - - -配置神经网络, 打开RecordIO文件 -############################## - -RecordIO文件转换好之后,用户可以使用 :code:`fluid.layers.open_files()` -打开文件,并使用 :code:`fluid.layers.read_file` 读取文件内容。 -简单使用方法如下: - -.. code-block:: python - - import paddle.fluid as fluid - - file_obj = fluid.layers.open_files( - filenames=["train.recordio"], - shape=[[3, 224, 224], [1]], - lod_levels=[0, 0], - dtypes=["float32", "int64"], - pass_num=100 - ) - - image, label = fluid.layers.read_file(file_obj) - -其中如果设置了 :code:`pass_num` ,那么当所有数据读完后,会重新读取数据, -直到读取了 :code:`pass_num` 遍。 - - - -进阶使用 -######## - - -使用 :code:`fluid.layers.double_buffer()` ------------------------------------------- - -:code:`Double buffer` 使用双缓冲技术,将训练数据从内存中复制到显存中。配置双缓冲 -需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如: - -.. code-block:: python - - import paddle.fliud as fluid - file_obj = fluid.layers.open_files(...) - file_obj = fluid.layers.double_buffer(file_obj) - - image, label = fluid.layers.read_file(file_obj) - -双缓冲技术可以参考 -`Multiple buffering `_ 。 - -配置数据增强 ------------- - -使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如 - -.. code-block:: python - - import paddle.fluid as fluid - file_obj = fluid.layers.open_files(...) - preprocessor = fluid.layers.Preprocessor(reader=data_file) - with preprocessor.block(): - image, label = preprocessor.inputs() - image = image / 2 - label = label + 1 - preprocessor.outputs(image, label) - -如上代码所示,使用 :code:`Preprocessor` 定义了一个数据增强模块,并在 -:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置 -:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用 -:code:`preprocessor.outputs()` 标记预处理后的输出。 - -.. 
_user_guide_use_recordio_as_train_data_use_op_create_batch: - -使用Op组batch -------------- - -使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如 - -.. code-block:: python - - import paddle.fluid as fluid - file_obj = fluid.layers.open_files(...) - file_obj = fluid.layers.batch(file_obj, batch_size=32) - - img, label = fluid.layers.read_file(file_obj) - -需要注意的是,如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据, -那么这几个样本直接组成一个批量数据进行训练。 - -读入数据的shuffle ----------------- - -使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如 - -.. code-block:: python - - import paddle.fluid as fluid - file_obj = fluid.layers.open_files(...) - file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192) - - img, label = fliud.layers.read_file(file_obj) - -需要注意的是: - -1. :code:`shuffle` 实现方法是: -先读入 :code:`buffer_size` 条样本,再随机的选出样本进行训练。 - -2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存,需要确定训练过程中内存 -足够支持缓存 :code:`buffer_size` 条数据。 diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6ae930b7155d15d24b287cc3eed50f2aeaa5599..bb5f2894c08b5d8941ad8914f6b83280aa053e37 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5132131e55e2feee8ae88b4c65ec102fbc9c5fe1 --- /dev/null +++ b/paddle/fluid/framework/.gitignore @@ -0,0 +1,2 @@ +.tensor_util.cu +.data_type_transform.cu \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0668ff43c8192f53ff7e05abaeb575e2b78b1de4..cc7938b2ac07f11ceb7f33a2e37380d1e2ed2072 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,22 @@ +# Windows treats a symbolic file as a real file, which is different from Unix. +# We create a hidden file and compile it instead of the original source file.
+function(windows_symbolic TARGET) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(src ${windows_symbolic_SRCS}) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) + message(FATAL_ERROR "${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.") + endif() + add_custom_command(OUTPUT .${src}.cu + COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + endforeach() +endfunction() + add_subdirectory(ir) if (NOT WIN32) add_subdirectory(details) @@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + if (WIN32) + windows_symbolic(tensor_util SRCS tensor_util.cu) + nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) + add_dependencies(tensor tensor_util) + else() + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + endif(WIN32) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) endif() @@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry device_context math_function) if(WITH_GPU) - nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + if (WIN32) + windows_symbolic(hidden_file SRCS data_type_transform.cu) + nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) + add_dependencies(data_type_transform hidden_file) + else() + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + endif(WIN32) nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) else() cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bfc649017f19d67660bd11d590134cf56772bb27..f5235f70ad79616801110644999d511eeda33a32 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,20 +1,35 @@ +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.
DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") +function(pass_library TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass) + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) +endfunction() + cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) -cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) -cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) -cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) -cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) -cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) + +pass_library(graph_to_program_pass) +pass_library(graph_viz_pass) +pass_library(fc_fuse_pass) +pass_library(attention_lstm_fuse_pass) +pass_library(infer_clean_graph_pass) +pass_library(fc_lstm_fuse_pass) +pass_library(seq_concat_fc_fuse_pass) +set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) -cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 4b3c0aa8d8ac10efb905e59e815504cc4ccf9bcd..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) { auto* cell_init = graph->RetriveNode(6); auto* hidden_init = graph->RetriveNode(8); -#define LINK_TO(node0, node1) \ - node0->outputs.push_back(node1); \ - node1->inputs.push_back(node0); - auto* lstm_op = graph->CreateOpNode(&op_desc); PrepareParameters(graph, param); - LINK_TO(X, lstm_op); - LINK_TO(cell_init, lstm_op); - LINK_TO(hidden_init, lstm_op); - LINK_TO(lstm_op, LSTMOUT); + IR_NODE_LINK_TO(X, lstm_op); + IR_NODE_LINK_TO(cell_init, lstm_op); + IR_NODE_LINK_TO(hidden_init, lstm_op); + IR_NODE_LINK_TO(lstm_op, LSTMOUT); GraphSafeRemoveNodes(graph, marked_nodes); } diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 
513742bab69d465aac1bfb7bcef2fe89108c14a0..5a4ebd6f3de555acccd72c61bd377ffd8ce69780 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -21,74 +21,26 @@ namespace paddle { namespace framework { namespace ir { -bool VarOutLinksToOp(Node* node, const std::string& op_type) { - for (auto* out : node->outputs) { - if (out->IsOp() && out->Op()->Type() == op_type) { - return true; - } - } - return false; -} - -void BuildFCPattern(PDPattern* pattern) { - // Create Operators - auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul"); - auto* elementwise_add_op = - pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add"); - // Create variables - // w - auto* mul_weight_var = pattern->NewNode("mul_weight") - ->AsInput() - ->assert_is_op_nth_input("mul", "Y", 0); - // x - auto* mul_tmp_var = pattern->NewNode("mul_tmp_var") - ->AsInput() - ->assert_is_op_nth_input("mul", "X", 0); - // intermediate variable, will be removed in the IR after fuse. - auto* mul_out_var = pattern->NewNode("mul_out") - ->AsIntermediate() - ->assert_is_only_output_of_op("mul") - ->assert_is_op_input("elementwise_add"); - // bias - auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar") - ->assert_is_op_input("elementwise_add") - ->AsInput(); - // output - auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out") - ->AsOutput() - ->assert_is_op_output("elementwise_add"); - - mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var}); - elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var}) - .LinksTo({elementwise_add_out_var}); -} - -// Replace the node `from` in the links to `to` -bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) { - for (auto*& n : *links) { - if (n == from) { - n = to; - return true; - } - } - return false; -} - std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( std::unique_ptr<ir::Graph> graph) const { PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("fc", graph.get()); + FusePassBase::Init("fc_fuse", graph.get()); std::unordered_set<const Node*> nodes2delete; GraphPatternDetector gpd; - BuildFCPattern(gpd.mutable_pattern()); - -#define GET_NODE(id) \ - PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ - PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + // BuildFCPattern(gpd.mutable_pattern()); + auto* x = gpd.mutable_pattern() + ->NewNode("fc_fuse/x") + ->AsInput() + ->assert_is_op_input("mul", "X"); + patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/); + +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id); int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -98,43 +50,33 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( // scenario. // FC's fusion is simple, just op fuse, no need to process the // parameters. - GET_NODE(mul_tmp_var); // x - GET_NODE(mul_weight); // Y - GET_NODE(elementwise_add_tmpvar); // bias - GET_NODE(elementwise_add_out); // Out - GET_NODE(mul); // MUL op - GET_NODE(elementwise_add); // ELEMENT_ADD op - GET_NODE(mul_out); // tmp + GET_NODE(x); // x + GET_NODE(w); // Y + GET_NODE(fc_bias); // bias + GET_NODE(fc_out); // Out + GET_NODE(mul); // MUL op + GET_NODE(elementwise_add); // ELEMENT_ADD op + GET_NODE(mul_out); // tmp #undef GET_NODE // Create an FC Node. OpDesc desc; - std::string fc_x_in = mul_tmp_var->Name(); - std::string fc_Y_in = mul_weight->Name(); - std::string fc_bias_in = elementwise_add_tmpvar->Name(); - std::string fc_out = elementwise_add_out->Name(); + std::string fc_x_in = x->Name(); + std::string fc_Y_in = w->Name(); + std::string fc_bias_in = fc_bias->Name(); + std::string fc_out_out = fc_out->Name(); desc.SetInput("Input", std::vector<std::string>({fc_x_in})); desc.SetInput("W", std::vector<std::string>({fc_Y_in})); desc.SetInput("Bias", std::vector<std::string>({fc_bias_in})); - desc.SetOutput("Out", std::vector<std::string>({fc_out})); + desc.SetOutput("Out", std::vector<std::string>({fc_out_out})); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. - fc_node->inputs = - std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar}); - fc_node->outputs.push_back(elementwise_add_out); - - // Update link relatons - PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node)); - PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs, - elementwise_add, fc_node)); - PADDLE_ENFORCE( - LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node)); + GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); - // Drop old nodes - graph->RemoveNode(mul); - graph->RemoveNode(elementwise_add); - graph->RemoveNode(mul_out); // tmp variable + IR_NODE_LINK_TO(x, fc_node); + IR_NODE_LINK_TO(w, fc_node); + IR_NODE_LINK_TO(fc_bias, fc_node); + IR_NODE_LINK_TO(fc_node, fc_out); found_fc_count++; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index c29eb4d4a6224773326c210d47d38fa9bf289a00..00f5e7fad2ef5d42eb0de9703389e910090d93c1 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" +#include <string> #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -94,21 +95,37 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()}); - op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"}); - op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"}); + op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"}); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); - auto* op = graph->CreateOpNode(&op_desc); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + +#define TMP_NAME(x) "at.new.tmp." 
#x +#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)}) + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT -#define LINK_TO(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - LINK_TO(input_n, op); - LINK_TO(weight_x_n, op); - LINK_TO(weight_h_n, op); - LINK_TO(bias_n, op); - LINK_TO(op, hidden_n); -#undef LINK_TO + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + +#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable() + TMP_NEW(BatchedCell); + TMP_NEW(BatchedHidden); + TMP_NEW(ReorderedH0); + TMP_NEW(ReorderedC0); +#undef TMP_NEW +#undef TMP_NAME + + IR_NODE_LINK_TO(input_n, op); + IR_NODE_LINK_TO(weight_x_n, op); + IR_NODE_LINK_TO(weight_h_n, op); + IR_NODE_LINK_TO(bias_n, op); + IR_NODE_LINK_TO(op, hidden_n); return op; }; @@ -116,7 +133,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, auto fc_no_bias_handler = [&]( const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - #define GET_NODE(name__) \ std::string name__##key = name_scope + "/" + #name__; \ auto* name__##n = pattern->RetrieveNode(name__##key); \ diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 95d56f4735e1e12f75e49f47360810374acd2164..2b1a9b8d0942d98b8bacee359f452f3d139b481b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -111,6 +111,11 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { return false; } } + for (auto& item : pdnodes2nodes_) { + for (auto& n : item.second) { + GetMarkedNodes(const_cast(&graph)).insert(n); + } + } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); @@ -278,7 +283,7 @@ void GraphPatternDetector::RemoveOverlappedMatch( for (const auto& subgraph : *subgraphs) { bool valid = true; for (auto& item : subgraph) { - if (node_set.count(item.second)) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { valid = false; break; } @@ -334,22 +339,22 @@ PDNode& PDNode::LinksFrom(const std::vector& others) { } PDNode* PDNode::assert_is_op() { - asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); return this; } PDNode* PDNode::assert_is_op(const std::string& op_type) { - asserts_.emplace_back([this, op_type](Node* x) { + asserts_.emplace_back([op_type](Node* x) { return x && x->IsOp() && x->Op()->Type() == op_type; }); return this; } PDNode* PDNode::assert_is_var() { - asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); + asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); return this; } PDNode* PDNode::assert_var_not_persistable() { assert_is_var(); - asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); }); + asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); return this; } PDNode* PDNode::assert_is_persistable_var() { @@ -491,14 +496,16 @@ void GraphSafeRemoveNodes(Graph* graph, for (auto it = node->inputs.begin(); it != node->inputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->inputs.erase(it); - } else + } else { it++; + } } for (auto it = node->outputs.begin(); it != node->outputs.end();) { if (nodes.count(*it)) { it = const_cast(node)->outputs.erase(it); - } else + } else { it++; + } } } } diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 024ce8ce55616cc5e0eaced4a27a6e1fb004af2c..9d67c4a6997dfe19561f37bf3ea76eba8b59ff35 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -245,6 +245,8 @@ class GraphPatternDetector { void UniquePatterns(std::vector<subgraph_t>* subgraphs); // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PDNodes will be removed, so they can't be shared by + // multiple patterns. void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs); // Validate whether the intermediate nodes are linked by external nodes. @@ -295,6 +297,10 @@ PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); } // namespace patterns +#define IR_NODE_LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 7e5c86b033a7c69a306491cf4bf8d099018c5f19..6c466fb21fb46e09961dc874e9e39655f83d17c6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) { return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); }, "OP0"); - auto* any_var = x.mutable_pattern()->NewNode( - [](Node* node) { return node->IsVar(); }, "VAR"); + auto* any_var = x.mutable_pattern() + ->NewNode([](Node* node) { return node->IsVar(); }, "VAR") + ->AsIntermediate(); auto* any_op1 = x.mutable_pattern()->NewNode( [](Node* node) { return node->IsOp(); }, "OP1"); diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index f885567da1965b997b2063e06c839af95b43e1e1..7713ed1eab88ee4fa16d52e7425075ae66f721a3 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -13,42 +13,41 @@ // limitations under the License. #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { namespace ir { -class InferCleanGraphPass : public Pass { +class InferCleanGraphPass : public FusePassBase { public: virtual ~InferCleanGraphPass() {} protected: std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const { + FusePassBase::Init("original_graph", graph.get()); PADDLE_ENFORCE(graph.get()); auto is_valid_node = [](Node* x) { return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); }; - std::unordered_set<Node*> invalid_nodes; + std::unordered_set<const Node*> invalid_nodes; + int valid_op = 0; for (auto* node : graph->Nodes()) { if (is_valid_node(node)) { invalid_nodes.insert(node); + } else if (node->IsOp()) { + // Collect all the operators to help track the number of operators. + ++valid_op; } } - // remove nodes from the graph. - for (auto* node : invalid_nodes) { - graph->RemoveNode(node); - } + GraphSafeRemoveNodes(graph.get(), invalid_nodes); - // clean edges. 
- for (auto* node : graph->Nodes()) { - CleanEdges(&node->inputs, invalid_nodes); - CleanEdges(&node->outputs, invalid_nodes); - } + AddStatis(valid_op); return graph; } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index a776a898a5ee13b4dde12460dce71433268fb9d4..e1a441d09aaa3647c4b2a582210a2c7e2b64e0da 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -219,16 +219,13 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( op_desc.SetAttr("fc_activation", act->Op()->Type()); auto* op_node = graph->CreateOpNode(&op_desc); -// Add links -#define NODE_LINKS(a, b) \ - a->outputs.push_back(b); \ - b->inputs.push_back(a); - NODE_LINKS(fc_w, op_node); - NODE_LINKS(fc_bias, op_node); - NODE_LINKS(concat_in0, op_node); - NODE_LINKS(sequence_expand0_in, op_node); - NODE_LINKS(sequence_expand1_in, op_node); - NODE_LINKS(op_node, fc_out); + // Add links + IR_NODE_LINK_TO(fc_w, op_node); + IR_NODE_LINK_TO(fc_bias, op_node); + IR_NODE_LINK_TO(concat_in0, op_node); + IR_NODE_LINK_TO(sequence_expand0_in, op_node); + IR_NODE_LINK_TO(sequence_expand1_in, op_node); + IR_NODE_LINK_TO(op_node, fc_out); // Clean nodes. std::unordered_set marked_nodes; @@ -241,7 +238,6 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( marked_nodes.erase(sequence_expand0_in); marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(fc_out); - GraphSafeRemoveNodes(graph, marked_nodes); }); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 86392078b356df774fbc47aed9214e9f10fe33be..2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -22,7 +22,7 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) #endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -32,6 +32,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index cc0dd0d492d42e9552c9ce081e268330599104f0..dadc8a53706fb9edff884dcf6d49168bfef3aa30 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -33,7 +33,7 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) @@ -56,25 +56,13 @@ if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING) endif() inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis - analysis_predictor - # ir - fc_fuse_pass - fc_lstm_fuse_pass - seq_concat_fc_fuse_pass - graph_viz_pass - infer_clean_graph_pass - graph_pattern_detector - infer_clean_graph_pass - attention_lstm_fuse_pass - paddle_inference_api - pass + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 0dd6f44028174ccb8b640344f71428a89c460460..d36c5bfb75b709aa3b8d9cfe070233168c642f6d 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); @@ -329,9 +330,20 @@ void TestDituRNNPrediction(bool use_analysis_and_activate_ir = false, LOG(INFO) << "fused " << item.first << " " << item.second; } - 
ASSERT_TRUE(fuse_statis.count("fc")); - EXPECT_EQ(fuse_statis.at("fc"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1); + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. } } @@ -348,10 +360,3 @@ TEST(Analyzer, DituRNN_multi_thread) { } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc index 6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd..367c25805d05f8d10fb8341158760ac6356a5c48 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" namespace paddle { namespace inference { @@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) { } // namespace analysis } // namespace inference } // namespace paddle - -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(attention_lstm_fuse_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index adfe4392448557a30cd834022b9a5d21d9086b95..3a43c72e33b3d5d8910b554021bb1c6a626edd93 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,7 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager - graph_viz_pass fc_fuse_pass - infer_clean_graph_pass - ) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 17310de28d471745ff74e8b01345709c1ebe46a1..90779d3e5aa59bedbb44d27c08c8bf4b77942cea 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -132,7 +133,3 @@ std::unique_ptr CreatePaddlePredictor< } } // namespace paddle - -USE_PASS(fc_fuse_pass); -USE_PASS(graph_viz_pass); -USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 026909ad8dbb6119d7cb3e720f162742a474754f..f98fe2d78871bdd921c993571fb792d107d16495 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -17,6 +17,7 @@ #include #include 
#include +#include #include #include #include diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..7e5cae04b81e6ce759b92f6c4b921ecf974e8260 100644 --- a/paddle/fluid/inference/paddle_fluid.map +++ b/paddle/fluid/inference/paddle_fluid.map @@ -1,6 +1,7 @@ { global: *paddle*; + *Pass*; local: *; }; diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 912415192659dc004f54a76e9cd1a20581d512a6..2e31d1c9c708225135e27c93ba94722794c4b282 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = static_cast<T>(1) / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp()); - auto temp2 = temp1 * (static_cast<T>(1) - (beta * out)); - dx.device(d) = dout * ((beta * out) + temp2); + auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out)); + dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2); } }; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index a02128c5a54c80ca7ccf9db347cd53f28bbb50f8..39b0c856996c11c6efdb530f1396afd5731c778d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" -#include #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0dee1781623d5a62830545c0952e5aadbe37accb --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +/* + * transform that computes target bounding-box regression deltas + * given proposal boxes and ground-truth boxes. 
+ */ +template <typename T> +inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, + const framework::Tensor& gt_boxes, const T* weights, + const bool normalized, framework::Tensor* box_delta) { + auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes); + auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes); + auto trg = framework::EigenTensor<T, 2>::From(*box_delta); + T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; + for (int64_t i = 0; i < box_num; ++i) { + ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false); + ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false); + ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; + ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; + + gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false); + gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false); + gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; + gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; + + trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w; + trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h; + trg(i, 2) = std::log(gt_w / ex_w); + trg(i, 3) = std::log(gt_h / ex_h); + + if (weights) { + trg(i, 0) = trg(i, 0) / weights[0]; + trg(i, 1) = trg(i, 1) / weights[1]; + trg(i, 2) = trg(i, 2) / weights[2]; + trg(i, 3) = trg(i, 3) / weights[3]; + } + } +} + +template <typename T> +void Gather(const T* in, const int in_stride, const int* index, const int num, + T* out) { + const int stride_bytes = in_stride * sizeof(T); + for (int i = 0; i < num; ++i) { + int id = index[i]; + memcpy(out + i * in_stride, in + id * in_stride, stride_bytes); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 0571c46f6be99c9a06b7dd2abb310eeda506ecd5..be06dc19743cfa6f093bcb3f4e9f91af315d4211 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/math_function.h" @@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, } } -template -void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes, - const std::vector& weights, Tensor* box_delta) { - auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); - auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); - auto box_delta_et = framework::EigenTensor::From(*box_delta); - T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; - for (int64_t i = 0; i < box_num; ++i) { - ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1; - ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1; - ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; - ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; - - gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1; - gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1; - gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; - gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; - - box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]; - box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]; - box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2]; - box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3]; - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, @@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor* sampled_labels, Tensor* sampled_gts) { int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); - int gt_num = fg_num + bg_num; Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); int* gt_box_inds_data = - gt_box_inds_t.mutable_data({gt_num}, context.GetPlace()); + gt_box_inds_t.mutable_data({fg_num}, context.GetPlace()); int* gt_label_inds_data = gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); @@ -303,18 +278,20 @@ std::vector SampleRoisForOneImage( // Gather boxes and labels Tensor sampled_boxes, sampled_labels, sampled_gts; - int boxes_num = fg_inds.size() + bg_inds.size(); + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int boxes_num = fg_num + bg_num; framework::DDim bbox_dim({boxes_num, kBoxDim}); sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); - sampled_gts.mutable_data(bbox_dim, context.GetPlace()); + sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights, + BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, &bbox_targets_single); // Scale rois @@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); - for (size_t i = 0; i < n; ++i) { + for (int i = 0; i < n; ++i) { Tensor 
rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index fcdcafae7273afa6887ee531dfc37ef833b92d68..ebe6830eccd87a156768eb0d4b96220bcc9f4edc 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel { rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, context.GetPlace()); - rpn_roi_probs->mutable_data({scores->numel() / 4, 1}, - context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); Tensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, @@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel { CPUGather(ctx, proposals, keep, &bbox_sel); CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { - return std::make_pair(bbox_sel, scores_sel); + return std::make_pair(bbox_sel, scores_filter); } Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 177ff7cf187bc9daf69889e99ca57ae18766de90..88757f25cd9a5789758640de2d9cae0b12350b25 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("DistMat"); PADDLE_ENFORCE_EQ(in_dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("LocationIndex", {-1}); + ctx->SetOutputDim("ScoreIndex", {-1}); + ctx->SetOutputDim("TargetLabel", {-1, 1}); + ctx->SetOutputDim("TargetBBox", {-1, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("DistMat")->type()), + platform::CPUPlace()); } }; template class RpnTargetAssignKernel : public framework::OpKernel { public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor_t = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_bbox_t = context.Input("GtBox"); + auto* dist_t = context.Input("DistMat"); + + auto* loc_index_t = context.Output("LocationIndex"); + auto* score_index_t = context.Output("ScoreIndex"); + auto* tgt_bbox_t = context.Output("TargetBBox"); + auto* tgt_lbl_t = context.Output("TargetLabel"); + + auto lod = dist_t->lod().back(); + int64_t batch_num = static_cast(lod.size() - 1); + int64_t anchor_num = dist_t->dims()[1]; + PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + + int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); + float pos_threshold = context.Attr("rpn_positive_overlap"); + float neg_threshold = context.Attr("rpn_negative_overlap"); + float fg_fraction = context.Attr("fg_fraction"); + + int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + + int64_t max_num = batch_num * anchor_num; + auto place = context.GetPlace(); + + 
tgt_bbox_t->mutable_data({max_num, 4}, place); + auto* loc_index = loc_index_t->mutable_data({max_num}, place); + auto* score_index = score_index_t->mutable_data({max_num}, place); + + Tensor tmp_tgt_lbl; + auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); + auto& dev_ctx = context.device_context(); + math::SetConstant iset; + iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); + + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + engine.seed(seed); + + int fg_num = 0; + int bg_num = 0; + for (int i = 0; i < batch_num; ++i) { + Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); + Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); + auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, + rpn_batch_size, fg_num_per_batch, engine, + tmp_lbl_data + i * anchor_num); + + int cur_fg_num = fg_bg_gt[0].size(); + int cur_bg_num = fg_bg_gt[1].size(); + std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, + [i, anchor_num](int d) { return d + i * anchor_num; }); + memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); + std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), + score_index + cur_fg_num, + [i, anchor_num](int d) { return d + i * anchor_num; }); + + // get target bbox deltas + if (cur_fg_num) { + Tensor fg_gt; + T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); + Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); + T* tgt_data = tgt_bbox.data(); + Gather(anchor_t->data(), 4, + reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, + tgt_data); + Gather(gt_bbox.data(), 4, reinterpret_cast(&fg_bg_gt[2][0]), + cur_fg_num, gt_data); + BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); + } + + loc_index += cur_fg_num; + score_index += cur_fg_num + cur_bg_num; + fg_num += cur_fg_num; + bg_num += cur_bg_num; + } + + int lbl_num = fg_num + bg_num; + PADDLE_ENFORCE_LE(fg_num, max_num); + PADDLE_ENFORCE_LE(lbl_num, max_num); + + tgt_bbox_t->Resize({fg_num, 4}); + loc_index_t->Resize({fg_num}); + score_index_t->Resize({lbl_num}); + auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); + Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, + lbl_data); + } + + private: void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label_data, + const float neg_threshold, int64_t* target_label, std::vector* fg_inds, std::vector* bg_inds) const { - int fg_offset = fg_inds->size(); - int bg_offset = bg_inds->size(); + float epsilon = 0.0001; for (int64_t i = 0; i < row; ++i) { const T* v = dist_data + i * col; - T max_dist = *std::max_element(v, v + col); + T max = *std::max_element(v, v + col); for (int64_t j = 0; j < col; ++j) { - T val = dist_data[i * col + j]; - if (val == max_dist) target_label_data[j] = 1; + if (std::abs(max - v[j]) < epsilon) { + target_label[j] = 1; + } } } - // Pick the fg/bg and count the number + // Pick the fg/bg + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max.data()[j] > pos_threshold) { - target_label_data[j] = 1; - } else if (anchor_to_gt_max.data()[j] < neg_threshold) { - target_label_data[j] = 0; + if (anchor_to_gt_max_data[j] >= pos_threshold) { + target_label[j] = 1; + } else if (anchor_to_gt_max_data[j] < neg_threshold) { + target_label[j] = 0; } - if (target_label_data[j] == 1) { - fg_inds->push_back(fg_offset + j); - } else if 
(target_label_data[j] == 0) { - bg_inds->push_back(bg_offset + j); + if (target_label[j] == 1) { + fg_inds->push_back(j); + } else if (target_label[j] == 0) { + bg_inds->push_back(j); } } } - void ReservoirSampling(const int num, const int offset, - std::minstd_rand engine, + void ReservoirSampling(const int num, std::minstd_rand engine, std::vector* inds) const { std::uniform_real_distribution uniform(0, 1); - const int64_t size = static_cast(inds->size() - offset); - if (size > num) { - for (int64_t i = num; i < size; ++i) { + size_t len = inds->size(); + if (len > static_cast(num)) { + for (size_t i = num; i < len; ++i) { int rng_ind = std::floor(uniform(engine) * i); if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind + offset, - inds->begin() + i + offset); + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); } + inds->resize(num); } } - void RpnTargetAssign(const framework::ExecutionContext& ctx, - const Tensor& dist, const float pos_threshold, - const float neg_threshold, const int rpn_batch_size, - const int fg_num, std::minstd_rand engine, - std::vector* fg_inds, std::vector* bg_inds, - int64_t* target_label_data) const { + // std::vector> RpnTargetAssign( + std::vector> SampleFgBgGt( + const platform::CPUDeviceContext& ctx, const Tensor& dist, + const float pos_threshold, const float neg_threshold, + const int rpn_batch_size, const int fg_num, std::minstd_rand engine, + int64_t* target_label) const { auto* dist_data = dist.data(); - int64_t row = dist.dims()[0]; - int64_t col = dist.dims()[1]; - int fg_offset = fg_inds->size(); - int bg_offset = bg_inds->size(); + int row = dist.dims()[0]; + int col = dist.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; // Calculate the max IoU between anchors and gt boxes - Tensor anchor_to_gt_max; - anchor_to_gt_max.mutable_data( - framework::make_ddim({static_cast(col), 1}), - platform::CPUPlace()); - auto& place = *ctx.template device_context() - .eigen_device(); - auto x = EigenMatrix::From(dist); - auto x_col_max = EigenMatrix::From(anchor_to_gt_max); - x_col_max.device(place) = - x.maximum(Eigen::DSizes(0)) - .reshape(Eigen::DSizes(static_cast(col), 1)); + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax; + anchor_to_gt_max.mutable_data({col}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({col}, place); + + auto x = framework::EigenMatrix::From(dist); + auto x_col_max = framework::EigenVector::Flatten(anchor_to_gt_max); + auto x_col_argmax = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + x_col_max = x.maximum(Eigen::DSizes(0)); + x_col_argmax = x.argmax(0).template cast(); + // Follow the Faster RCNN's implementation ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, - neg_threshold, target_label_data, fg_inds, bg_inds); + neg_threshold, target_label, &fg_inds, &bg_inds); // Reservoir Sampling - ReservoirSampling(fg_num, fg_offset, engine, fg_inds); - int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset); - ReservoirSampling(bg_num, bg_offset, engine, bg_inds); - } + ReservoirSampling(fg_num, engine, &fg_inds); + int fg_num2 = static_cast(fg_inds.size()); + int bg_num = rpn_batch_size - fg_num2; + ReservoirSampling(bg_num, engine, &bg_inds); - void Compute(const framework::ExecutionContext& context) const override { - auto* dist = context.Input("DistMat"); - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - 
auto* tgt_lbl = context.Output("TargetLabel"); - - auto col = dist->dims()[1]; - int64_t n = dist->lod().size() == 0UL - ? 1 - : static_cast(dist->lod().back().size() - 1); - if (dist->lod().size()) { - PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL, - "Only support 1 level of LoD."); + gt_inds.reserve(fg_num2); + for (int i = 0; i < fg_num2; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); - - int fg_num = static_cast(rpn_batch_size * fg_fraction); - - int64_t* target_label_data = - tgt_lbl->mutable_data({n * col, 1}, context.GetPlace()); + std::vector> fg_bg_gt; + fg_bg_gt.emplace_back(fg_inds); + fg_bg_gt.emplace_back(bg_inds); + fg_bg_gt.emplace_back(gt_inds); - auto& dev_ctx = context.device_context(); - math::SetConstant iset; - iset(dev_ctx, tgt_lbl, static_cast(-1)); - - std::vector fg_inds; - std::vector bg_inds; - std::random_device rnd; - std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); - engine.seed(seed); - - if (n == 1) { - RpnTargetAssign(context, *dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds, - target_label_data); - } else { - auto lod = dist->lod().back(); - for (size_t i = 0; i < lod.size() - 1; ++i) { - Tensor one_ins = dist->Slice(lod[i], lod[i + 1]); - RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold, - rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds, - target_label_data + i * col); - } - } - int* loc_index_data = loc_index->mutable_data( - {static_cast(fg_inds.size())}, context.GetPlace()); - int* score_index_data = score_index->mutable_data( - {static_cast(fg_inds.size() + bg_inds.size())}, - context.GetPlace()); - memcpy(loc_index_data, reinterpret_cast(&fg_inds[0]), - fg_inds.size() * sizeof(int)); - memcpy(score_index_data, reinterpret_cast(&fg_inds[0]), - fg_inds.size() * sizeof(int)); - memcpy(score_index_data + fg_inds.size(), - reinterpret_cast(&bg_inds[0]), bg_inds.size() * sizeof(int)); + return fg_bg_gt; } }; class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddInput("Anchor", + "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); + AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); AddInput( "DistMat", "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " @@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "ScoreIndex", "(Tensor), The indexes of foreground and background anchors in all " "RPN anchors(The rest anchors are ignored). 
The shape of the " - "ScoreIndex is [F + B], F and B depend on the value of input " - "tensor and attributes."); - AddOutput("TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[K * M, 1], " - "K and M is the same as they are in DistMat."); + "ScoreIndex is [F + B], F and B are sampled foreground and background " + " number."); + AddOutput("TargetBBox", + "(Tensor), The target bbox deltas with shape " + "[F, 4], F is the sampled foreground number."); + AddOutput( + "TargetLabel", + "(Tensor), The target labels of each anchor with shape " + "[F + B, 1], F and B are sampled foreground and background number."); AddComment(R"DOC( Given the IoU between the ground-truth bboxes and the anchors, this operator assigns classification and regression targets to each prediction. diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index d5b9b2dac085e7abc31ef243be82eaa815d387ba..b1a399c22c2b9ed7464a1b1764478803d4416d94 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -94,8 +95,11 @@ class RowwiseTransformIterator; template class MidWiseTransformIterator; +// NOTE(dzhwinter): ptrdiff_t in iterator is deprecated in c++17 template -class RowwiseTransformIterator { +class RowwiseTransformIterator + : public std::iterator { public: RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} @@ -126,7 +130,9 @@ class RowwiseTransformIterator { }; template -class MidWiseTransformIterator { +class MidWiseTransformIterator + : public std::iterator { public: MidWiseTransformIterator(const T *ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} @@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast( const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { size_t N = static_cast(framework::product(x_dim)); +#if !defined(_WIN32) platform::ForRange for_range( ctx.template device_context(), N); +#else + platform::ForRange for_range( + ctx.device_context(), N); +#endif // !_WIN32 for_range(ElemwiseGradNoBroadcast{ x.data(), y.data(), out.data(), dout.data(), dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), @@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx, template + void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { TransformFunctor functor( x, y, z, ctx.template device_context(), func); - auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 3a34aa86b6331e4fe2813eea97cb6644323807c3..582c75872ab2818cdf834f9a46278db1d6f91d54 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -13,16 +13,13 @@ See the License for the specific language governing permissions and limitations under the License.
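// Editor's note (added commentary, not part of the original patch): the rewritten
// ReservoirSampling() in rpn_target_assign_op.cc above keeps at most `num` of the collected
// anchor indices with roughly equal probability, without knowing the total count in advance.
// Below is a self-contained sketch of the classic "Algorithm R" pattern it follows; the helper
// name and signature are hypothetical, not Paddle's API.
#include <algorithm>
#include <random>
#include <vector>

template <typename T>
void ReservoirSample(size_t num, std::vector<T>* inds, std::minstd_rand* engine) {
  if (inds->size() <= num) return;  // fewer candidates than the reservoir size: keep them all
  for (size_t i = num; i < inds->size(); ++i) {
    // Draw a slot uniformly from [0, i]; element i survives only if the slot
    // falls inside the reservoir formed by the first `num` entries.
    size_t j = std::uniform_int_distribution<size_t>(0, i)(*engine);
    if (j < num) {
      std::iter_swap(inds->begin() + j, inds->begin() + i);
    }
  }
  inds->resize(num);  // the first `num` entries now hold the sample
}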
*/ #include "paddle/fluid/operators/fusion_gru_op.h" +#include // for memcpy #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "Input(WeightH) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), - "Output(BatchedGate) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), - "Output(BatchResetHiddenPrev) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), - "Output(BatchedHidden) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Output(ReorderedH0) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Output(BatchedInput) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), + "Output(BatchedOut) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), "Output(Hidden) of GRU should not be null."); @@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { } framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); - ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); - ctx->SetOutputDim("BatchedHidden", out_dims); - ctx->SetOutputDim("BatchResetHiddenPrev", out_dims); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedOut", out_dims); ctx->ShareLoD("X", "Hidden"); - int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wx_dims[1]; + } else { + xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + } ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() { "(Tensor) The FC weight with shape (M x 3D)," "where M is the dim size of x, D is the hidden size. "); AddInput("WeightH", - "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "); + "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. " + "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" + "Acutally they are D x 2D and D x D two part weights." + "{W_update, W_reset; W_state}" + "{D x (D + D); D x D}"); AddInput("Bias", "(Tensor, optional) (1 x 3D)." "Almost same as GRUOp." 
"Note: if have FC bias it should be added on this bias.") .AsDispensable(); + AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.") + .AsIntermediate(); AddOutput("XX", - "(LoDTensor) the result after X * WeightX (size is T x 4D)" + "(LoDTensor) the result after X * WeightX (size is T x 3D)" " or batched_X (size is T x M), this will be automatically chosen," " where T is the total time steps in this mini-batch," " D is the hidden size, M is the dim size of x input.") .AsIntermediate(); - AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate(); - AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.") + AddOutput("BatchedInput", + "(LoDTensor) This is the batched result of input X" + "or the batched result after fc, shape (T x 3D)") .AsIntermediate(); - AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.") + AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.") .AsIntermediate(); AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); AddAttr("activation", @@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() { "(bool, defalut: False) " "whether to compute reversed GRU.") .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute GRU.") + .SetDefault(true); AddComment(R"DOC( The Fusion complete GRU Operator. This operator fuse the fully-connected operator into GRU, @@ -153,172 +165,261 @@ more details can refer to GRU op. )DOC"); } -template -inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, - framework::Vector index_lod, - framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - row_shuffle(ctx, src, index_lod, dst, indexed_src); -} - -template +template class FusionGRUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#define INIT_VEC_FUNC \ + std::function act_gate, act_state; \ + std::function cross; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_state_str = ctx.Attr("activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_state = act_functor(act_state_str); \ + cross = math::vec_cross; \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_state = act_functor(act_state_str); \ + cross = math::vec_cross; \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); + +#define INIT_BASE_SIZES \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_dims[0]; \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D3 = wh_dims[1]; \ + const int D2 = D * 2; + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; auto* x = ctx.Input("X"); - auto* wx = ctx.Input("WeightX"); - auto* wh = ctx.Input("WeightH"); - auto* bias = ctx.Input("Bias"); - auto* h0 = ctx.Input("H0"); - - auto* xx = ctx.Output("XX"); - auto* batched_gate = ctx.Output("BatchedGate"); - 
auto* batch_reset_hidden_prev = - ctx.Output("BatchResetHiddenPrev"); - auto* batch_hidden = ctx.Output("BatchedHidden"); - auto* hidden_out = ctx.Output("Hidden"); - bool is_reverse = ctx.Attr("is_reverse"); + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : nullptr; + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + const T* wh_state_data = wh_data + D * D2; T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); - batch_reset_hidden_prev->mutable_data(ctx.GetPlace()); - batch_hidden->mutable_data(ctx.GetPlace()); - hidden_out->mutable_data(ctx.GetPlace()); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D3, M, x_data, wx_data, + xx_data, + bias ? bias->data() : nullptr); + + int xx_offset = D3; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 3; + hidden_out_data = hidden_out_data + offset; + xx_offset = -D3; + gate_offset = -D; + } + auto move_step = [&]() { + xx_data = xx_data + xx_offset; + hidden_out_data = hidden_out_data + gate_offset; + }; + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_hidden_data = nullptr; + int tstart = 0; + if (h0_data) { + prev_hidden_data = h0_data + bid * D; + } else { + // W: {W_update, W_reset; W_state} + // update gate + act_gate(D, xx_data, xx_data); + // state gate + act_state(D, xx_data + D2, xx_data + D2); + // out = a*b + blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data); + // save prev + prev_hidden_data = hidden_out_data; + tstart = 1; + move_step(); + } + for (int step = tstart; step < seq_len; ++step) { + // gemm prev * (Wu + Wr) + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), + prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, + D3); + act_gate(D2, xx_data, xx_data); + // rt = rt*ht_1 inplace result + blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data); + + // gemm rt * Ws + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), + hidden_out_data, D, wh_state_data, D, static_cast(1), + xx_data + D2, D3); + act_state(D, xx_data + D2, xx_data + D2); + // out = zt*ht~ + (1-zt)*ht_1 + cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data); + // save prev + prev_hidden_data = hidden_out_data; + move_step(); + } + } + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + if (x->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_out = ctx.Output("BatchedOut"); const T* x_data = x->data(); const T* wx_data = wx->data(); const T* wh_data = wh->data(); - auto x_dims = x->dims(); - auto wx_dims = wx->dims(); + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_input_data = batched_input->mutable_data(ctx.GetPlace()); + T* batched_out_data = batched_out->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); math::LoDTensor2BatchFunctor to_batch; - 
if (x_dims[1] > wx_dims[1]) { - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - x_data, wx_data, xx_data, - bias ? bias->data() : NULL); - to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + if (M > D3) { + math::FCCompute(blas, total_T, D3, M, x_data, wx_data, + xx_data, + bias ? bias->data() : nullptr); + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); } else { to_batch(dev_ctx, *x, xx, true, is_reverse); - batched_gate->set_lod(xx->lod()); - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - xx_data, wx_data, batched_gate_data, - bias ? bias->data() : NULL); + batched_input->set_lod(xx->lod()); + math::FCCompute(blas, total_T, D3, M, xx_data, wx_data, + batched_input_data, + bias ? bias->data() : nullptr); } - int frame_size = static_cast(wx_dims[1] / 3); - math::GRUMetaValue gru_value; - gru_value.gate_weight = const_cast(wh_data); - gru_value.state_weight = - const_cast(wh_data + 2 * frame_size * frame_size); - Tensor ordered_h0; - - framework::Vector order(batched_gate->lod()[2]); + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + int tstart = 0; + T* prev_hidden_data = nullptr; if (h0) { - ReorderInitState( - ctx.template device_context(), *h0, order, &ordered_h0, - true); - gru_value.prev_out_value = ordered_h0.data(); + // reorder h0 + T* reordered_h0_data = reordered_h0->mutable_data(ctx.GetPlace()); + const T* h0_data = h0->data(); + prev_hidden_data = reordered_h0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + } } else { - gru_value.prev_out_value = nullptr; + // compute without h0 + T* cur_in_data = batched_input_data; + T* cur_out_data = batched_out_data; + // W: {W_update, W_reset; W_state} + for (int i = 0; i < max_bs; ++i) { + // update gate + act_gate(D, cur_in_data, cur_in_data); + // state gate + act_state(D, cur_in_data + D2, cur_in_data + D2); + // out = a*b + blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data); + // add offset + cur_in_data += D3; + cur_out_data += D; + } + tstart = 1; + prev_hidden_data = batched_out_data; } - auto batch_starts = batched_gate->lod()[0]; - size_t seq_len = batch_starts.size() - 1; - auto active_node = - math::detail::GetActivationType(ctx.Attr("activation")); - auto active_gate = math::detail::GetActivationType( - ctx.Attr("gate_activation")); - -#ifdef PADDLE_WITH_MKLML - // use MKL packed to speedup GEMM - if (FLAGS_paddle_num_threads >= 4) { - auto blas = math::GetBlas(dev_ctx); - T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, - frame_size * 2 /*width of weight*/, - frame_size /*height of height*/); - PADDLE_ENFORCE(packed_gate); - blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, - frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, - packed_gate); - T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, - frame_size /*width of weight*/, - frame_size /*height of height*/); - PADDLE_ENFORCE(packed_state); - blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, - frame_size, T(1.0), gru_value.state_weight, frame_size, - packed_state); - for (size_t n = 0; n < seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - int cur_batch_size = bend - bstart; - - Tensor gate_t = batched_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - 
batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.data(); - gru_value.gate_value = gate_t.data(); - gru_value.reset_output_value = reset_hidden_prev_t.data(); - - if (gru_value.prev_out_value) { - blas.GEMM_COMPUTE( - CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2, - frame_size, gru_value.prev_out_value, frame_size, packed_gate, - frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); - } - - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); - - if (gru_value.prev_out_value) { - blas.GEMM_COMPUTE( - CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, - gru_value.reset_output_value, frame_size, packed_state, - frame_size, T(1), gru_value.gate_value + frame_size * 2, - frame_size * 3); - } - - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node); - - gru_value.prev_out_value = gru_value.output_value; + // Then start from next + const T* wh_state_data = wh_data + D * D2; + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + batched_input_data = batched_input_data + tstart * max_bs * D3; + batched_out_data = batched_out_data + tstart * max_bs * D; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + // gemm prev * (Wu + Wr) + blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast(1), + prev_hidden_data, D, wh_data, D2, static_cast(1), + batched_input_data, D3); + + T* cur_batched_data = batched_input_data; + T* cur_out_data = batched_out_data; + T* cur_prev_hidden_data = prev_hidden_data; + for (int i = 0; i < cur_bs; ++i) { + act_gate(D2, cur_batched_data, cur_batched_data); + // rt = rt*ht_1 inplace result + blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data); + + cur_batched_data += D3; + cur_prev_hidden_data += D; + cur_out_data += D; } - blas.GEMM_FREE(packed_gate); - blas.GEMM_FREE(packed_state); - } else { -#endif - for (size_t n = 0; n < seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - int cur_batch_size = bend - bstart; - - Tensor gate_t = batched_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.data(); - gru_value.gate_value = gate_t.data(); - gru_value.reset_output_value = reset_hidden_prev_t.data(); - - math::GRUUnitFunctor::compute( - dev_ctx, gru_value, frame_size, cur_batch_size, active_node, - active_gate); - - gru_value.prev_out_value = gru_value.output_value; + cur_batched_data = batched_input_data; + cur_out_data = batched_out_data; + blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast(1), + cur_out_data, D, wh_state_data, D, static_cast(1), + cur_batched_data + D2, D3); + + cur_prev_hidden_data = prev_hidden_data; + for (int i = 0; i < cur_bs; ++i) { + // ht~ = act_state(...) 
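// Editor's note (added commentary, not part of the original patch): per time step the fused GRU
// kernel above computes, for each lane d in [0, D), with .* denoting elementwise multiply:
//   u_d  = act_gate(xu_d + (h_prev * Wu)_d)          // update gate, columns [0, D)
//   r_d  = act_gate(xr_d + (h_prev * Wr)_d)          // reset gate, columns [D, 2D)
//   c~_d = act_state(xc_d + ((r .* h_prev) * Ws)_d)  // candidate state, columns [2D, 3D)
//   h_d  = u_d * c~_d + (1 - u_d) * h_prev_d         // vec_cross: out = zt*ht~ + (1-zt)*ht_1
// where the x* terms are the precomputed x * WeightX (+ bias) values and the D x 3D WeightH is
// split into {Wu, Wr} (first GEMM) and Ws (second, smaller GEMM).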
+ act_state(D, cur_batched_data + D2, cur_batched_data + D2); + // out = zt*ht~ + (1-zt)*ht_1 + cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data, + cur_out_data); + + cur_batched_data += D3; + cur_prev_hidden_data += D; + cur_out_data += D; } -#ifdef PADDLE_WITH_MKLML + prev_hidden_data = batched_out_data; + batched_out_data = cur_out_data; + batched_input_data = cur_batched_data; } -#endif + math::Batch2LoDTensorFunctor to_seq; - batch_hidden->set_lod(batched_gate->lod()); - to_seq(dev_ctx, *batch_hidden, hidden_out); + batched_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_out, hidden_out); } +#undef INIT_VEC_FUNC +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT }; } // namespace operators @@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_CPU_KERNEL( - fusion_gru, ops::FusionGRUKernel, - ops::FusionGRUKernel); +REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel, + ops::FusionGRUKernel); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index e4e4ac8e333ba423e151dea05e40a0e41042570e..f91236975d0cf0c89a464188bd6ea1b5b01e0f6d 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -16,14 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" -DEFINE_bool(seq_mode, true, "Use sequence mode"); - namespace paddle { namespace operators { @@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "Output(Hidden) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), "Output(Cell) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), - "Output(BatchedGate) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), - "Output(BatchedGate) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Output(BatchedInput) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Output(BatchedHidden) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), + "Output(BatchedCell) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Output(ReorderedH0) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), + "Output(ReorderedC0) of LSTM should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -97,13 +99,14 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Cell", out_dims); - ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); - ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchedCell", out_dims); ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); int xx_width; 
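// Editor's note (added commentary, not part of the original patch): XX doubles as a scratch
// buffer. In seq mode it always holds x * WeightX, so it needs wx_dims[1] columns (4D for the
// fused LSTM, 3D for the fused GRU); in batch mode it may instead hold the re-batched input,
// so the narrower of x_dims[1] and wx_dims[1] is enough, as the branch below chooses.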
- if (FLAGS_seq_mode) { + if (ctx->Attrs().Get("use_seq")) { xx_width = wx_dims[1]; } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; @@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() { " where T is the total time steps in this mini-batch," " D is the hidden size, M is the dim size of x input.") .AsIntermediate(); - AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); - AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") - .AsIntermediate(); + AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() { "(bool, defalut: False) " "whether to compute reversed LSTM.") .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute.") + .SetDefault(true); AddAttr("gate_activation", "(string, default: sigmoid)" "The activation for input gate, forget gate and output " @@ -203,64 +212,60 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. )DOC"); } -template -inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, - framework::Vector index_lod, - framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; - dst->mutable_data(src.dims(), ctx.GetPlace()); - // TODO(TJ): check mem copy perf - row_shuffle(ctx, src, index_lod, dst, indexed_src); -} - template class FuisonLSTMKernel : public framework::OpKernel { public: +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); + +#define INIT_BASE_SIZES \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + const int D4 = wh_dims[1]; + void SeqCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = paddle::platform::CPUDeviceContext; - auto* x = ctx.Input("X"); - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - auto* wx = ctx.Input("WeightX"); - auto* wh = ctx.Input("WeightH"); - auto* bias = ctx.Input("Bias"); - - auto* xx = ctx.Output("XX"); - auto* hidden_out = ctx.Output("Hidden"); - auto* cell_out = 
ctx.Output("Cell"); - bool is_reverse = ctx.Attr("is_reverse"); - - std::function act_gate, act_cell, act_cand; - auto& act_gate_str = ctx.Attr("gate_activation"); - auto& act_cell_str = ctx.Attr("cell_activation"); - auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; - act_gate = act_functor(act_gate_str); - act_cell = act_functor(act_cell_str); - act_cand = act_functor(act_cand_str); - } else { - math::VecActivations act_functor; - act_gate = act_functor(act_gate_str); - act_cell = act_functor(act_cell_str); - act_cand = act_functor(act_cand_str); - } + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC auto x_lod = x->lod(); - auto x_dims = x->dims(); // T x M - auto wh_dims = wh->dims(); // D x 4D const int total_T = x_dims[0]; const int N = x_lod[0].size() - 1; // batch size - const int M = x_dims[1]; // x frame size - const int D = wh_dims[0]; - const int D2 = D * 2; - const int D3 = D * 3; - const int D4 = wh_dims[1]; const T* x_data = x->data(); - const T* h0_data = h0 ? h0->data() : NULL; - const T* c0_data = c0 ? c0->data() : NULL; + const T* h0_data = h0 ? h0->data() : nullptr; + const T* c0_data = c0 ? c0->data() : nullptr; const T* wx_data = wx->data(); const T* wh_data = wh->data(); T* xx_data = xx->mutable_data(ctx.GetPlace()); @@ -290,12 +295,12 @@ class FuisonLSTMKernel : public framework::OpKernel { for (int i = 0; i < N; ++i) { int bid = is_reverse ? N - 1 - i : i; int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; - const T* prev_cell_data = NULL; - const T* prev_hidden_data = NULL; + const T* prev_c_data = nullptr; + const T* prev_h_data = nullptr; int tstart = 0; if (h0_data) { - prev_hidden_data = h0_data + bid * D; - prev_cell_data = c0_data + bid * D; + prev_h_data = h0_data + bid * D; + prev_c_data = c0_data + bid * D; } else { // W_ch, W_ih, W_fh, W_oh act_gate(D3, xx_data + D, xx_data + D); @@ -307,23 +312,22 @@ class FuisonLSTMKernel : public framework::OpKernel { blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); // prev - prev_hidden_data = hidden_out_data; - prev_cell_data = cell_out_data; + prev_h_data = hidden_out_data; + prev_c_data = cell_out_data; tstart = 1; move_step(); } for (int step = tstart; step < seq_len; ++step) { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), - prev_hidden_data, D, wh_data, D4, static_cast(1), xx_data, - D4); + prev_h_data, D, wh_data, D4, static_cast(1), xx_data, D4); // W_ch, W_ih, W_fh, W_oh act_gate(D3, xx_data + D, xx_data + D); act_cand(D, xx_data, xx_data); // a = forget * prev_cell - blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2); + blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2); // b = input * tilde blas.VMUL(D, xx_data, xx_data + D, xx_data + D); @@ -336,8 +340,8 @@ class FuisonLSTMKernel : public framework::OpKernel { blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); // prev - prev_hidden_data = hidden_out_data; - prev_cell_data = cell_out_data; + prev_h_data = hidden_out_data; + prev_c_data = cell_out_data; move_step(); } @@ -346,143 +350,155 @@ class FuisonLSTMKernel : public framework::OpKernel { void BatchCompute(const framework::ExecutionContext& ctx) const { using DeviceContext = platform::CPUDeviceContext; - auto* x = ctx.Input("X"); - auto* wx = ctx.Input("WeightX"); - auto* wh = ctx.Input("WeightH"); - auto* bias = ctx.Input("Bias"); - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); - - auto* xx = ctx.Output("XX"); - auto* batched_gate = 
ctx.Output("BatchedGate"); - auto* hidden_out = ctx.Output("Hidden"); - auto* cell_out = ctx.Output("Cell"); - bool is_reverse = ctx.Attr("is_reverse"); + INIT_BASE_INPUT_OUTPUT + if (x->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_SIZES + INIT_VEC_FUNC - T* xx_data = xx->mutable_data(ctx.GetPlace()); - T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); - hidden_out->mutable_data(ctx.GetPlace()); - cell_out->mutable_data(ctx.GetPlace()); + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_c_out = ctx.Output("BatchedCell"); + auto* batched_h_out = ctx.Output("BatchedHidden"); const T* x_data = x->data(); const T* wx_data = wx->data(); - auto x_dims = x->dims(); - auto wx_dims = wx->dims(); + const T* wh_data = wh->data(); + auto place = ctx.GetPlace(); + T* xx_data = xx->mutable_data(place); + T* batched_input_data = batched_input->mutable_data(place); + T* batched_c_out_data = batched_c_out->mutable_data(place); + T* batched_h_out_data = batched_h_out->mutable_data(place); + hidden_out->mutable_data(place); + cell_out->mutable_data(place); math::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); - if (x_dims[1] > wx_dims[1]) { - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - x_data, wx_data, xx_data, - bias->data()); - to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + if (M > D4) { + math::FCCompute(blas, x_dims[0], D4, M, x_data, wx_data, + xx_data, bias->data()); + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); } else { to_batch(dev_ctx, *x, xx, true, is_reverse); - batched_gate->set_lod(xx->lod()); - math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], - xx_data, wx_data, batched_gate_data, + batched_input->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], D4, M, xx_data, + wx_data, batched_input_data, bias->data()); } - int frame_size = static_cast(wx_dims[1] / 4); - framework::DDim out_dims({x_dims[0], frame_size}); - math::LstmMetaValue lstm_value; - // no peephole - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - lstm_value.prev_state_value = nullptr; - Tensor ordered_c0; - - framework::Vector order(batched_gate->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. 
- ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, - true); - lstm_value.prev_state_value = ordered_c0.data(); + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + reordered_c0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_h_data = nullptr; + T* prev_c_data = nullptr; + if (h0) { + // reorder h0, c0 + T* reordered_h0_data = reordered_h0->mutable_data(place); + T* reordered_c0_data = reordered_c0->mutable_data(place); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + prev_h_data = reordered_h0_data; + prev_c_data = reordered_c0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + reordered_c0_data += D; + } + } else { + // compute without h0, c0 + T* cur_in_data = batched_input_data; + T* cur_h_out_data = batched_h_out_data; + T* cur_c_out_data = batched_c_out_data; + // W_ch, W_ih, W_fh, W_oh + for (int i = 0; i < max_bs; ++i) { + act_gate(D3, cur_in_data + D, cur_in_data + D); + act_cand(D, cur_in_data, cur_in_data); + // cell out= input*tilde + blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data); + // hidden out= act_state(cellout) * outgate + act_cell(D, cur_c_out_data, cur_in_data + D2); + blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); + + // add offset + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + } + tstart = 1; + prev_h_data = batched_h_out_data; + prev_c_data = batched_c_out_data; } + // Then start from next + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + const int offset = tstart * max_bs * D; + batched_input_data = batched_input_data + offset * 4; + batched_h_out_data = batched_h_out_data + offset; + batched_c_out_data = batched_c_out_data + offset; + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast(1), + prev_h_data, D, wh_data, D4, static_cast(1), + batched_input_data, D4); + + T* cur_in_data = batched_input_data; + T* cur_prev_c_data = prev_c_data; + T* cur_c_out_data = batched_c_out_data; + T* cur_h_out_data = batched_h_out_data; + for (int i = 0; i < cur_bs; ++i) { + // W_ch, W_ih, W_fh, W_oh + act_gate(D3, cur_in_data + D, cur_in_data + D); + act_cand(D, cur_in_data, cur_in_data); + // a = forget * prev_cell + blas.VMUL(D, cur_in_data + D2, cur_prev_c_data, cur_in_data + D2); + // b = input * tilde + blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D); + // cell out= a+b + blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data); + // hidden out= act_state(cellout) * outgate + act_cell(D, cur_c_out_data, cur_in_data + D2); + blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data); - // Use the local variable as here. 
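// Editor's note (added commentary, not part of the original patch): with the 4D gate columns
// laid out as {c~, i, f, o} (the "W_ch, W_ih, W_fh, W_oh" comments above), each fused LSTM step
// computes per lane d:
//   c~_d = act_cand(xc_d + (h_prev * Wc)_d)       // candidate, columns [0, D)
//   i_d, f_d, o_d = act_gate(...)                 // input, forget, output gates
//   c_d  = f_d * c_prev_d + i_d * c~_d            // the VMUL / VMUL / VADD sequence
//   h_d  = o_d * act_cell(c_d)                    // output gate times the activated cell
// which is the standard LSTM cell, just evaluated on contiguous D-wide blocks so the vectorized
// activations and blas VMUL/VADD primitives can be used.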
- LoDTensor batch_hidden, batch_cell; - auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); - batch_hidden.mutable_data(out_dims, ctx.GetPlace()); - batch_cell.mutable_data(out_dims, ctx.GetPlace()); - batch_cell_pre_act->mutable_data(out_dims, ctx.GetPlace()); - - auto batch_starts = batched_gate->lod()[0]; - size_t max_seq_len = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( - ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( - ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( - ctx.Attr("candidate_activation")); - - for (size_t n = 0; n < max_seq_len; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - Tensor gate_t = batched_gate->Slice(bstart, bend); - Tensor out_t = batch_hidden.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - // TODO(TJ): use gemm directly - blas.MatMul(pre_hidden_t, false, *wh, false, static_cast(1.0), - &gate_t, static_cast(1.0)); - } else if (hidden_t0) { - // TODO(TJ): move h0 outside for - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skiped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. - Tensor ordered_h0; - ReorderInitState(dev_ctx, *hidden_t0, order, - &ordered_h0, true); - // TODO(TJ): use gemm directly - blas.MatMul(ordered_h0, false, *wh, false, static_cast(1.0), &gate_t, - static_cast(1.0)); + cur_in_data += D4; + cur_prev_c_data += D; + cur_c_out_data += D; + cur_h_out_data += D; } - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( - dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; + prev_c_data = batched_c_out_data; + prev_h_data = batched_h_out_data; + batched_c_out_data = cur_c_out_data; + batched_h_out_data = cur_h_out_data; + batched_input_data = cur_in_data; } math::Batch2LoDTensorFunctor to_seq; - batch_hidden.set_lod(batched_gate->lod()); - // restore the output hidden in LoDTensor from the batch hidden - to_seq(dev_ctx, batch_hidden, hidden_out); - - batch_cell.set_lod(batched_gate->lod()); - // restore the output cell state in LoDTensor from the batch cell - to_seq(dev_ctx, batch_cell, cell_out); + batched_h_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_h_out, hidden_out); + batched_c_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_c_out, cell_out); } + void Compute(const framework::ExecutionContext& ctx) const override { - if (FLAGS_seq_mode) { + if (ctx.Attr("use_seq")) { SeqCompute(ctx); } else { BatchCompute(ctx); } } +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_VEC_FUNC }; } // namespace operators diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 
2d9faed648aef78da60706e13db3862080c96514..451ec61ba1f7239d92c6dfbad0b2961e74e1bc17 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel { gate_data, frame_size * 3); // calculate activited gate - Eigen::array extents({{batch_size, frame_size}}); - Eigen::array u_offsets({{0, 0}}); + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; ActCompute(context.Attr("gate_activation"), place, g.slice(u_offsets, extents), g.slice(u_offsets, extents)); auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets({{0, frame_size}}); + Eigen::array r_offsets{{0, frame_size}}; ActCompute(context.Attr("gate_activation"), place, g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate @@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel { weight_data + frame_size * frame_size * 2, frame_size, 1, gate_data + frame_size * 2, frame_size * 3); - Eigen::array c_offsets({{0, frame_size * 2}}); + Eigen::array c_offsets{{0, frame_size * 2}}; ActCompute(context.Attr("activation"), place, g.slice(c_offsets, extents), g.slice(c_offsets, extents)); auto c = g.slice(c_offsets, extents); // output candidate @@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel { int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; - Eigen::array extents({{batch_size, frame_size}}); - Eigen::array u_offsets({{0, 0}}); + Eigen::array extents{{batch_size, frame_size}}; + Eigen::array u_offsets{{0, 0}}; auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets({{0, frame_size}}); + Eigen::array r_offsets{{0, frame_size}}; auto r = g.slice(r_offsets, extents); // reset gate - Eigen::array c_offsets({{0, frame_size * 2}}); + Eigen::array c_offsets{{0, frame_size * 2}}; auto c = g.slice(c_offsets, extents); // output candidate // backward for unactivated update gate diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h index f56fd95e96526c59e040fbbd2812360e59570a08..f3da17de011053fa118b5a4257bb5c3b00084741 100644 --- a/paddle/fluid/operators/label_smooth_op.h +++ b/paddle/fluid/operators/label_smooth_op.h @@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel { auto dist = framework::EigenVector::Flatten(*dist_t); out.device(dev) = static_cast(1 - epsilon) * in + - epsilon * dist.broadcast(Eigen::DSizes(in_t->numel())); + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes(in_t->numel())); } else { out.device(dev) = static_cast(1 - epsilon) * in + static_cast(epsilon / label_dim); diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 5693761e9ffd96b40040223b5498b63b0274bf0f..9560e3a3c15ca63892fbe3552679a22f027f11e2 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -132,6 +132,121 @@ inline void vec_scal(const int n, vec_scal(n, a, x, y); } +template +inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a - x[i]; + } +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_bias_sub(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; 
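// Editor's note (added commentary, not part of the original patch): the AVX specialization below
// processes the array in blocks of 8 floats (AVX_FLOAT_BLOCK), computing y = a - x with a single
// _mm256_sub_ps per block, and finishes the trailing n % 8 elements with the scalar loop so that
// arbitrary lengths are handled; inputs shorter than one block fall back to the generic version.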
+#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_sub_ps(bias, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a - x[i]; + } +#else + vec_bias_sub(n, a, x, y); +#endif +} + +template <> +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); +} + +template <> +inline void vec_bias_sub(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_bias_sub(n, a, x, y); +} + +// out = x*y + (1-x)*z +template +inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { + for (int i = 0; i < n; ++i) { + out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; + } +} + +template <> +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_cross(n, x, y, z, out); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(1.f); + __m256 tmpx, tmpy, tmpz; + for (i = 0; i < end; i += block) { + tmpx = _mm256_loadu_ps(x + i); + tmpy = _mm256_loadu_ps(y + i); + tmpz = _mm256_loadu_ps(z + i); + tmpy = _mm256_mul_ps(tmpx, tmpy); + tmpx = _mm256_sub_ps(bias, tmpx); + tmpz = _mm256_mul_ps(tmpx, tmpz); + tmpz = _mm256_add_ps(tmpy, tmpz); + _mm256_storeu_ps(out + i, tmpz); + } + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; + } +#else + vec_cross(n, x, y, z, out); +#endif +} + +template <> +inline void vec_cross(const int n, const float* x, + const float* y, + const float* z, float* out) { + vec_cross(n, x, y, z, out); +} + +template <> +inline void vec_cross( + const int n, const float* x, const float* y, const float* z, float* out) { + // TODO(TJ): enable me + vec_cross(n, x, y, z, out); +} + template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 5454d58f371afb5f5d6a1c3208318f80d4e0aa36..07854c83584f90db02b416b85a4aa61f5cdc0685 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -17,6 +17,11 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#if defined(_WIN32) +#include +#include +#endif // _WIN32 + namespace paddle { namespace operators { namespace math { @@ -55,12 +60,38 @@ namespace math { * FindLastSet(x) = 1 + \floor*{\log_{2}x} * \f] */ +#if !defined(_WIN32) inline constexpr size_t FindLastSet(size_t x) { return std::is_same::value ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) : (std::is_same::value // NOLINT ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) : (x ? 
8 * sizeof(x) - __builtin_clzll(x) : 0)); + +#else +// windows don't have built-in clz, ctz function +template +inline int ctz(const T& value) { + DWORD trailing_zero = 0; + if (_BitScanForward(&trailing_zero, value)) { + return static_cast(trailing_zero); + } else { + return static_cast(0); + } +} + +template +inline int clz(const T& value) { + DWORD leadning_zero = 0; + if (_BitScanReverse(&leadning_zero, value)) { + return static_cast(sizeof(T) * 8 - leadning_zero); + } else { + return static_cast(0); + } +} + +inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } +#endif // !_WIN32 } struct SimpleCode { diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 4166fb54946b7082f5f7dc0e232f636a1d2f8a13..e4d378dc23210e95605c6e09eda8a190cc5c6b4f 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -16,13 +16,12 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX __FLT_MAX__ - template class MaxOutFunctor { public: diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 2538d739cce95d1b2fc5b3f905af5e6d94cf7af5..120f5919803806e0d3b7dc8eaf530ae89819b84d 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -18,15 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm - // still wondering where to put it. - /* * \brief Extracting simple operations from pooling. * Both MaxPool and AvgPool need "initial", "compute" and "finalize" diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h index 07372235a7c23832e528c3e852a4747f4244b833..a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor { // Calculate the start position of each batch. // example: sequences = {s0, s1, s2} // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // num_batch = 5, + // max_seqlen = 5, // batchIndex = {b0, b1, b2, b3, b4} // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} @@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor { // where 1 is the second sequence, // 0 is the first sequence, // 2 is the third sequence. - // The num_batch represents batch size after rearranging the + // The max_seqlen represents batch size after rearranging the // input LodTensor. It is also the maximum length of input sequence. 
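// Editor's note (added commentary, not part of the original patch): the rename from num_batch to
// max_seqlen makes the meaning explicit: after the sequences are re-bucketed by time step, the
// number of step-wise "batches" equals the length of the longest input sequence, and
// batch_starts[n] marks where the n-th time step begins in the rearranged tensor.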
paddle::framework::LoD batch_lods; @@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor { batch_lods.emplace_back(std::vector{0}); // batch_lods[0] is the start positions for batch LoDTensor - int num_batch = seq_info[0].length; - batch_lods[0].resize(static_cast(num_batch + 1)); + int max_seqlen = seq_info[0].length; + batch_lods[0].resize(static_cast(max_seqlen + 1)); // batch_lods[1] is the raw index in the input LoDTensor batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); // batch_lods[2] is the sort order for the input LoDTensor. @@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor { size_t* batch_starts = batch_lods[0].data(); size_t* seq2batch_idx = batch_lods[1].data(); batch_starts[0] = 0; - for (int n = 0; n < num_batch; n++) { + for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); for (size_t i = 0; i < seq_info.size(); ++i) { int seq_len = seq_info[i].length; diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e..46e20285db6d7acd39dead3994409645adddf494 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) { template __global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const int64_t* input_rois, + const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { @@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward( int c = (i / pooled_width / pooled_height) % channels; int n = i / pooled_width / pooled_height / channels; - const int64_t* offset_input_rois = input_rois + n * kROISize; + const T* offset_input_rois = input_rois + n * kROISize; int roi_batch_ind = roi_batch_id_data[n]; int roi_start_w = round(offset_input_rois[0] * spatial_scale); int roi_start_h = round(offset_input_rois[1] * spatial_scale); @@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward( template __global__ void GPUROIPoolBackward( - const int nthreads, const int64_t* input_rois, const T* output_grad, + const int nthreads, const T* input_rois, const T* output_grad, const int64_t* argmax_data, const int num_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, int* roi_batch_id_data, @@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { GPUROIPoolForward< T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - channels, height, width, pooled_height, pooled_width, + output_size, in->data(), rois->data(), spatial_scale, channels, + height, width, pooled_height, pooled_width, roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } @@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (output_grad_size > 0) { GPUROIPoolBackward< T><<>>( - output_grad_size, rois->data(), out_grad->data(), + output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, roi_batch_id_list_gpu.data(), diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index c4f739b2c6b2d62ebebcc15fd627ebad040e7b3f..07de7c9f0e070cef7c6f38f8d564ab76910842db 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ 
b/paddle/fluid/operators/roi_pool_op.h @@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { T* output_data = out->mutable_data(ctx.GetPlace()); int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - const int64_t* rois_data = rois->data(); + const T* rois_data = rois->data(); for (int n = 0; n < rois_num; ++n) { int roi_batch_id = roi_batch_id_data[n]; int roi_start_w = round(rois_data[0] * spatial_scale); @@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { } } - const int64_t* rois_data = rois->data(); + const T* rois_data = rois->data(); const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index cfee9207083b46f7c27354f22e82a7d3c38a027c..5b05f757c0355ed15617dea925b5d4929fcbfee0 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include @@ -23,40 +22,11 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { -// TODO(sidgoyal78): These function are needed by other files (save_op), move -// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op). -constexpr char kSEP = '/'; -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -static void MkDir(const char *path) { - if (mkdir(path, 0755)) { - PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); - } -} - -static void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} - class SaveCombineOp : public framework::OperatorBase { public: SaveCombineOp(const std::string &type, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 85de37416b5f24128ee98320a872eafffe967c81..e79cffcf498c52ed14db235f6221cfdf08399c9d 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include @@ -25,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -33,36 +33,6 @@ namespace operators { // to directory specified. constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; -// TODO(yuyang18): If the functions below are needed by other files, move them -// to paddle::filesystem namespace. 
-constexpr char kSEP = '/'; -static bool FileExists(const std::string &filepath) { - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -} - -static std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -static void MkDir(const char *path) { - if (mkdir(path, 0755)) { - PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path); - } -} - -static void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} - class SaveOp : public framework::OperatorBase { public: SaveOp(const std::string &type, const framework::VariableNameMap &inputs, diff --git a/paddle/fluid/operators/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_enumerate_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..58e48c228bb34814700fd0f7a3d62ef4b1a435dd --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_enumerate_op.h" + +namespace paddle { +namespace operators { + +class SequenceEnumerateOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of SequenceEnumerate operator should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of SequenceEnumerate operator should not be null."); + + const auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2UL, + "Input(X) of SequenceEnumerate operator's rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], 1UL, + "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1."); + + const auto win_size = ctx->Attrs().Get<int>("win_size"); + ctx->SetOutputDim("Out", {x_dims[0], win_size}); + ctx->ShareLoD("X", "Out"); + } +}; + +class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(2-D LoDTensor with the 2nd dimension equal to 1) " + "Input LoDTensor of SequenceEnumerate operator."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dimension equal to win_size) " + "Output LoDTensor of SequenceEnumerate operator."); + AddAttr<int>("win_size", "(int) The enumerate sequence window size.") + .AddCustomChecker([](const int& win_size) { + PADDLE_ENFORCE(win_size >= 2, + "The window size should not be less than 2."); + }); + AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.") + .SetDefault(0); + AddComment(R"DOC( +Sequence Enumerate Operator. + +Generate a new sequence for the input index sequence, which enumerates all the +sub-sequences with length `win_size` of the input.
+The enumerated sequence has the same 1st dimension with variable `input`, and +the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. + +Examples: +Case 1: + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + Attrs: + win_size = 2 + pad_value = 0 + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp, + ops::SequenceEnumerateOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_enumerate, + ops::SequenceEnumerateKernel, + ops::SequenceEnumerateKernel); diff --git a/paddle/fluid/operators/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_enumerate_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdc9a615aa9a1ecd99c1f6995361f8c5ff0aa383 --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.cu @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/operators/sequence_enumerate_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void CalcOutPut(const T* in_data, const size_t* in_lod, + const size_t lod_len, const int64_t win_size, + const int64_t pad_value, T* out_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_lod[lod_len - 1]) { + int end_idx = 0; + // Get LoD interval of index + for (int i = 1; i < lod_len; ++i) { + if (index < in_lod[i]) { + end_idx = in_lod[i]; + break; + } + } + for (size_t i = 0; i < win_size; ++i) { + int word_pos = index + i; + out_data[index * win_size + i] = + word_pos < end_idx ? 
in_data[word_pos] : pad_value; + } + } +} + +template +class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int win_size = context.Attr("win_size"); + int pad_value = context.Attr("pad_value"); + + auto in_dims = in->dims(); + auto in_lod = in->lod(); + + PADDLE_ENFORCE_EQ( + static_cast(in_dims[0]), in_lod[0].back(), + "The actual input data's size mismatched with LoD information."); + + /* Generate enumerate sequence set */ + auto stream = context.cuda_device_context().stream(); + auto lod0 = in_lod[0]; + auto in_len = in->numel(); + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + // Copy LoD to GPU + const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); + // Calc output tensor + CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + sequence_enumerate, + paddle::operators::SequenceEnumerateOpCUDAKernel, + paddle::operators::SequenceEnumerateOpCUDAKernel); diff --git a/paddle/fluid/operators/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_enumerate_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dc18d9b2071303377505155476b87ed029eaf986 --- /dev/null +++ b/paddle/fluid/operators/sequence_enumerate_op.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using LoDTensor = framework::LoDTensor; + +template +class SequenceEnumerateKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int win_size = context.Attr("win_size"); + int pad_value = context.Attr("pad_value"); + + auto in_dims = in->dims(); + auto in_lod = in->lod(); + + PADDLE_ENFORCE_EQ( + static_cast(in_dims[0]), in_lod[0].back(), + "The actual input data's size mismatched with LoD information."); + + // Generate enumerate sequence set + auto lod0 = in_lod[0]; + auto in_data = in->data(); + auto out_data = out->mutable_data(context.GetPlace()); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) { + for (int word_idx = 0; word_idx < win_size; ++word_idx) { + size_t word_pos = idx + word_idx; + out_data[win_size * idx + word_idx] = + word_pos < lod0[i + 1] ? 
in_data[word_pos] : pad_value; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 4cc04b090519637ab0b8d3740b8a12f216218cae..32b7efc04c1f2ecc22f93c08387aec69ded4930a 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include // Disable the copy and assignment operator for a class. #ifndef DISABLE_COPY_AND_ASSIGN @@ -23,3 +24,7 @@ limitations under the License. */ classname& operator=(const classname&) = delete; \ classname& operator=(classname&&) = delete #endif + +#if defined(__FLT_MAX__) +#define FLT_MAX __FLT_MAX__ +#endif // __FLT_MAX__ diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index a0a2d29500e7afbe8a9a43f010d5fd2d0c560467..cf9f4aa95bc1cb79d95b79331fbc09e11af64194 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -14,24 +14,141 @@ #pragma once +#include #include + +#include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" + #if !defined(_WIN32) -#include // for dladdr -#include // for backtrace +#define UNUSED __attribute__((unused)) +#include // dladdr +#include // backtrace +#include +#include // std::accumulate #else -#include -#include +#include // _popen, _pclose +#include +#if defined(_WIN32) +#include // std::accumulate in msvc +#endif +// windows version of __attribute__((unused)) +#define UNUSED __pragma(warning(suppress : 4100)) -static void* dlsym(void* handle, const char* symbol_name) { +#ifndef S_ISDIR // windows port for sys/stat.h +#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) +#endif // S_ISDIR + +static void *dlsym(void *handle, const char *symbol_name) { FARPROC found_symbol; found_symbol = GetProcAddress((HMODULE)handle, symbol_name); if (found_symbol == NULL) { throw std::runtime_error(std::string(symbol_name) + " not found."); } - return reinterpret_cast(found_symbol); + return reinterpret_cast(found_symbol); } -#endif +static void *dlopen(const char *filename, int flag) { + std::string file_name(filename); + file_name.replace(0, file_name.size() - 1, '/', '\\'); + HMODULE hModule = LoadLibrary(file_name.c_str()); + if (!hModule) { + throw std::runtime_error(file_name + " not found."); + } + return reinterpret_cast(hModule); +} + +#endif // !_WIN32 + +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; +#if !defined(_WIN32) + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); +#else + std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); +#endif // _WIN32 + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + +static bool PathExists(const std::string &path) { +#if !defined(_WIN32) + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#else + struct _stat statbuf; + if (_stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } +#endif // !_WIN32 + return false; +} + +// TODO(yuyang18): If the functions below are needed by other files, move them +// to paddle::filesystem namespace. 
+#if !defined(_WIN32) +constexpr char kSEP = '/'; +#else +constexpr char kSEP = '\\'; +#endif // _WIN32 + +static bool FileExists(const std::string &filepath) { +#if !defined(_WIN32) + struct stat buffer; + return (stat(filepath.c_str(), &buffer) == 0); +#else + struct _stat buffer; + return (_stat(filepath.c_str(), &buffer) == 0); +#endif // !_WIN32 +} + +static std::string DirName(const std::string &filepath) { + auto pos = filepath.rfind(kSEP); + if (pos == std::string::npos) { + return ""; + } + return filepath.substr(0, pos); +} + +static void MkDir(const char *path) { + std::string path_error(path); + path_error += " mkdir failed!"; +#if !defined(_WIN32) + if (mkdir(path, 0755)) { + if (errno != EEXIST) { + throw std::runtime_error(path_error); + } + } +#else + CreateDirectory(path, NULL); + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } +#endif // !_WIN32 +} + +static void MkDirRecursively(const char *fullpath) { + if (*fullpath == '\0') return; // empty string + if (FileExists(fullpath)) return; + + MkDirRecursively(DirName(fullpath).c_str()); + MkDir(fullpath); +} diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 3d2ef566173f81b29a6d8ea79cff00991a4ef3c4..a9b94a20720615dbfca97749463f27dbc88ac64f 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -98,10 +98,9 @@ class Inferencer(object): raise ValueError( "inputs should be a map of {'input_name': input_var}") - with executor.scope_guard(self.scope): - results = self.exe.run(self.inference_program, - feed=inputs, - fetch_list=[self.predict_var], + with self._prog_and_scope_guard(): + results = self.exe.run(feed=inputs, + fetch_list=[self.predict_var.name], return_numpy=return_numpy) return results diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 5757b2798e43dc70b406462a74b4f74eedcf56fa..1bc1dbbecaccd328d84cd3364a50c8f828d823c0 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -145,26 +145,23 @@ def rpn_target_assign(loc, """ helper = LayerHelper('rpn_target_assign', **locals()) - # 1. Compute the regression target bboxes - target_bbox = box_coder( - prior_box=anchor_box, - prior_box_var=anchor_var, - target_box=gt_box, - code_type='encode_center_size', - box_normalized=False) - # 2. Compute overlaps between the prior boxes and the gt boxes overlaps + # Compute overlaps between the prior boxes and the gt boxes overlaps iou = iou_similarity(x=gt_box, y=anchor_box) - # 3. 
Assign target label to anchors - loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype) - score_index = helper.create_tmp_variable(dtype=anchor_box.dtype) - target_label = helper.create_tmp_variable(dtype=anchor_box.dtype) + # Assign target label to anchors + loc_index = helper.create_tmp_variable(dtype='int32') + score_index = helper.create_tmp_variable(dtype='int32') + target_label = helper.create_tmp_variable(dtype='int64') + target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'DistMat': iou}, + inputs={'Anchor': anchor_box, + 'GtBox': gt_box, + 'DistMat': iou}, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, - 'TargetLabel': target_label + 'TargetLabel': target_label, + 'TargetBBox': target_bbox, }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, @@ -173,16 +170,16 @@ def rpn_target_assign(loc, 'fg_fraction': fg_fraction }) - # 4. Reshape and gather the target entry - scores = nn.reshape(x=scores, shape=(-1, 2)) - loc = nn.reshape(x=loc, shape=(-1, 4)) - target_label = nn.reshape(x=target_label, shape=(-1, 1)) - target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4)) + loc_index.stop_gradient = True + score_index.stop_gradient = True + target_label.stop_gradient = True + target_bbox.stop_gradient = True + scores = nn.reshape(x=scores, shape=(-1, 1)) + loc = nn.reshape(x=loc, shape=(-1, 4)) predicted_scores = nn.gather(scores, score_index) predicted_location = nn.gather(loc, loc_index) - target_label = nn.gather(target_label, score_index) - target_bbox = nn.gather(target_bbox, loc_index) + return predicted_scores, predicted_location, target_label, target_bbox diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0ecfc958a3b89c85ef00574d630042d410c3fa0a..a0d92fd1462acb18cdb2463b51138c9ff33b08a8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -111,6 +111,7 @@ __all__ = [ 'stack', 'pad2d', 'unstack', + 'sequence_enumerate', ] @@ -5823,6 +5824,51 @@ def flatten(x, axis=1, name=None): return out +def sequence_enumerate(input, win_size, pad_value=0, name=None): + """ + Generate a new sequence for the input index sequence, which enumerates all the + sub-sequences with length `win_size` of the input. + The enumerated sequence has the same 1st dimension with variable `input`, and + the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. + + Examples: + Case 1: + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + Attrs: + win_size = 2 + pad_value = 0 + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] + + Args: + input (Variable): The input variable which is a index sequence. + win_size (int): The window size for enumerating all sub-sequences. + pad_value (int): The padding value, default 0. + + Returns: + Variable: The enumerate sequence variable which is a LoDTensor. + + Examples: + .. 
code-block:: python + + x = fluid.layers.data(name='x', shape=[30, 1], dtype='int32', lod_level=1) + out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) + """ + helper = LayerHelper('sequence_enumerate', **locals()) + out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True) + helper.append_op( + type='sequence_enumerate', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'win_size': win_size, + 'pad_value': pad_value}) + return out + + def sequence_mask(x, maxlen=None, dtype='int64', name=None): """ **SequenceMask Layer** @@ -5902,6 +5948,7 @@ def stack(x, axis=0): helper.append_op( type='stack', inputs={'X': x}, outputs={'Y': out}, attrs={'axis': axis}) + return out diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index be494a0d340c62fb35afbf97fba38eff08a965e6..2e15c224f662171bf0fee228bdd9d36189fbe499 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -16,7 +16,9 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import paddle.fluid.core as core import numpy +import os import cifar10_small_test_set @@ -89,7 +91,7 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): BATCH_SIZE = 128 EPOCH_NUM = 1 @@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, optimizer_func=optimizer_func, place=place) + train_func=train_program, + optimizer_func=optimizer_func, + place=place, + parallel=parallel) trainer.train( reader=train_reader, @@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname): feed_order=['pixel', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range @@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results) -def main(use_cuda): +def main(use_cuda, parallel): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_resnet.inference.model" + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel.
+ if parallel and use_cuda: + return + + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) if __name__ == '__main__': for use_cuda in (False, True): - main(use_cuda=use_cuda) + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index dbc7bc06c93157f271c79e85b6925468e861e57f..2f205de1c011cd714439d4896adc8862ce68d99b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -16,7 +16,9 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import paddle.fluid.core as core import numpy +import os import cifar10_small_test_set @@ -68,7 +70,7 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( @@ -93,7 +95,10 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) + train_func=train_program, + place=place, + optimizer_func=optimizer_func, + parallel=parallel) trainer.train( reader=train_reader, @@ -102,10 +107,13 @@ def train(use_cuda, train_program, params_dirname): feed_order=['pixel', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range @@ -116,22 +124,31 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results) -def main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return +def main(use_cuda, parallel): save_path = "image_classification_vgg.inference.model" + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. 
+ if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_network, - params_dirname=save_path) + params_dirname=save_path, + parallel=parallel) if __name__ == '__main__': for use_cuda in (False, True): - main(use_cuda=use_cuda) + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 187bef1b0c1a614fbca88ef22097831d7bd5cd7f..a5adf68158526b628deba3fc7ca6856eb7c9cded 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -64,14 +64,14 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, parallel, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, - parallel=True) + parallel=parallel) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): @@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname): feed_order=['img', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) -def main(use_cuda): +def main(use_cuda, parallel): params_dirname = "recognize_digits_conv.inference.model" # call train() with is_local argument to run distributed train + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) + + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. 
+ if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) if __name__ == '__main__': - # for use_cuda in (False, True): - main(use_cuda=core.is_compiled_with_cuda()) + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index b95e7db122adbb1414da1691926c920b963fd6fe..e7d8b23b3253d368210c08be4e53c06ba0c5d618 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -16,6 +16,7 @@ from __future__ import print_function import argparse import paddle.fluid as fluid +import paddle.fluid.core as core import paddle import sys import numpy @@ -50,11 +51,14 @@ def optimizer_func(): return fluid.optimizer.Adam(learning_rate=0.001) -def train(use_cuda, train_program, params_dirname): +def train(use_cuda, train_program, params_dirname, parallel): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) + train_func=train_program, + place=place, + optimizer_func=optimizer_func, + parallel=parallel) def event_handler(event): if isinstance(event, fluid.EndEpochEvent): @@ -86,11 +90,14 @@ def train(use_cuda, train_program, params_dirname): feed_order=['img', 'label']) -def infer(use_cuda, inference_program, params_dirname=None): +def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) + infer_func=inference_program, + param_path=params_dirname, + place=place, + parallel=parallel) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -101,20 +108,32 @@ def infer(use_cuda, inference_program, params_dirname=None): print("infer results: ", results[0]) -def main(use_cuda): +def main(use_cuda, parallel): params_dirname = "recognize_digits_mlp.inference.model" # call train() with is_local argument to run distributed train + os.environ['CPU_NUM'] = str(4) train( use_cuda=use_cuda, train_program=train_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) + + # FIXME(zcd): in the inference stage, the number of + # input data is one, it is not appropriate to use parallel. 
+ if parallel and use_cuda: + return + os.environ['CPU_NUM'] = str(1) infer( use_cuda=use_cuda, inference_program=inference_program, - params_dirname=params_dirname) + params_dirname=params_dirname, + parallel=parallel) if __name__ == '__main__': - # for use_cuda in (False, True): - main(use_cuda=False) + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index ec0bf3ff8d64345111537780aaa5367ed0e1f8ff..e2564763d19d180f7c6933429dddf58c77be7bb8 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase): gt_box = layers.data( name='gt_box', shape=[4], lod_level=1, dtype='float32') - predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign( + pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( loc=loc, scores=scores, anchor_box=anchor_box, @@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_positive_overlap=0.7, rpn_negative_overlap=0.3) - self.assertIsNotNone(predicted_scores) - self.assertIsNotNone(predicted_location) - self.assertIsNotNone(target_label) - self.assertIsNotNone(target_bbox) - assert predicted_scores.shape[1] == 2 - assert predicted_location.shape[1] == 4 - assert predicted_location.shape[1] == target_bbox.shape[1] - - print(str(program)) + self.assertIsNotNone(pred_scores) + self.assertIsNotNone(pred_loc) + self.assertIsNotNone(tgt_lbl) + self.assertIsNotNone(tgt_bbox) + assert pred_scores.shape[1] == 1 + assert pred_loc.shape[1] == 4 + assert pred_loc.shape[1] == tgt_bbox.shape[1] class TestGenerateProposals(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py index 764f83b534c8a183dbf21511f0b05741c13c9528..36ebc8fb6ea9efdcd1807f5c8917ab1428b3381e 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -37,7 +37,7 @@ def fusion_gru( h0, wh, np.zeros( - (1, wh.shape[1]), dtype='float64'), + (1, wh.shape[1]), dtype='float32'), is_reverse, act_state, act_gate) @@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest): T = sum(self.lod[0]) N = len(self.lod[0]) - x = np.random.rand(T, self.M).astype('float64') - wx = np.random.rand(self.M, 3 * self.D).astype('float64') - wh = np.random.rand(self.D, 3 * self.D).astype('float64') + x = np.random.rand(T, self.M).astype('float32') + wx = np.random.rand(self.M, 3 * self.D).astype('float32') + wh = np.random.rand(self.D, 3 * self.D).astype('float32') bias = np.random.rand( - 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( - (1, 3 * self.D), dtype='float64') + 1, 3 * self.D).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float32') h0 = np.random.rand( - N, self.D).astype('float64') if self.with_h0 else np.zeros( - (N, self.D), dtype='float64') + N, self.D).astype('float32') if self.with_h0 else np.zeros( + (N, self.D), dtype='float32') _, _, _, hidden = fusion_gru( x, self.lod, h0, wx, wh, bias, self.is_reverse, @@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8) + for use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output() class 
TestFusionGRUOpNoInitial(TestFusionGRUOp): diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py index 5805bdf461998e90611dec05b079cd55feda520d..1f1eb37667e304351a6a85edde09e7da32cf1630 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -114,7 +114,9 @@ class TestFusionLSTMOp(OpTest): } def test_check_output(self): - self.check_output() + for use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output() class TestFusionLSTMOpInit(TestFusionLSTMOp): diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py index ce766fffbce98a6a2cee4c508d6db85ee0163401..6dc101b6dad8813893c6a891da0e16f952bb4c2d 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py @@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights): dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] - dw = (np.log(gt_w / ex_w)) / ex_w / weights[2] - dh = (np.log(gt_h / ex_h)) / ex_h / weights[3] + dw = (np.log(gt_w / ex_w)) / weights[2] + dh = (np.log(gt_h / ex_h)) / weights[3] targets = np.vstack([dx, dy, dw, dh]).transpose() return targets diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ecdf32524afb1357b192ce14674b7073972dee9f..bc4d364c74c6cb6b8f0df59e7ede77e6271f4b96 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -549,6 +549,13 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_enumerate(self): + program = Program() + with program_guard(program): + x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1) + out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index ed7f467835f32242a9650f226b4a5ad9d6d87af4..ad4cd2e803bfae4c3fbc04503331b9a786b25d17 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest): for i in range(self.rois_num): roi = self.rois[i] - roi_batch_id = roi[0] + roi_batch_id = int(roi[0]) roi_start_w = int(cpt.round(roi[1] * self.spatial_scale)) roi_start_h = int(cpt.round(roi[2] * self.spatial_scale)) roi_end_w = int(cpt.round(roi[3] * self.spatial_scale)) @@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest): roi = [bno, x1, y1, x2, y2] rois.append(roi) self.rois_num = len(rois) - self.rois = np.array(rois).astype("int64") + self.rois = np.array(rois).astype("float32") def setUp(self): self.op_type = "roi_pool" diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index 08c462d9036cacab81dab7c9ea16664c9159479f..bd548009b3ada9512e4b5f7d7b61b67b0717a39b 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -18,12 +18,17 @@ import unittest import numpy as np import 
paddle.fluid.core as core from op_test import OpTest +from test_anchor_generator_op import anchor_generator_in_python +from test_generate_proposal_labels import _generate_groundtruth +from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta -def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, - rpn_negative_overlap, fg_fraction): - iou = np.transpose(iou) +def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, fg_fraction): + iou = np.transpose(gt_anchor_iou) anchor_to_gt_max = iou.max(axis=1) + anchor_to_gt_argmax = iou.argmax(axis=1) + gt_to_anchor_argmax = iou.argmax(axis=0) gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] @@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] + tgt_lbl[bg_inds] = 0 if len(bg_inds) > num_bg: enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] tgt_lbl[enable_inds] = 0 bg_inds = np.where(tgt_lbl == 0)[0] + tgt_lbl[bg_inds] = 0 loc_index = fg_inds score_index = np.hstack((fg_inds, bg_inds)) tgt_lbl = np.expand_dims(tgt_lbl, axis=1) - return loc_index, score_index, tgt_lbl + + gt_inds = anchor_to_gt_argmax[fg_inds] + + return loc_index, score_index, tgt_lbl, gt_inds + + +def get_anchor(n, c, h, w): + input_feat = np.random.random((n, c, h, w)).astype('float32') + anchors, _ = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[32., 64.], + aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + offset=0.5) + return anchors + + +def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, fg_fraction): + + loc_indexes = [] + score_indexes = [] + tmp_tgt_labels = [] + tgt_bboxes = [] + anchor_num = anchor.shape[0] + + batch_size = len(lod) - 1 + for i in range(batch_size): + b, e = lod[i], lod[i + 1] + iou_slice = iou[b:e, :] + bboxes_slice = gt_boxes[b:e, :] + + loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign( + iou_slice, rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, fg_fraction) + + fg_bboxes = bboxes_slice[gt_inds] + fg_anchors = anchor[loc_idx] + box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.]) + + if i == 0: + loc_indexes = loc_idx + score_indexes = score_idx + tmp_tgt_labels = tgt_lbl + tgt_bboxes = box_deltas + else: + loc_indexes = np.concatenate( + [loc_indexes, loc_idx + i * anchor_num]) + score_indexes = np.concatenate( + [score_indexes, score_idx + i * anchor_num]) + tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl]) + tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) + + tgt_labels = tmp_tgt_labels[score_indexes] + return loc_indexes, score_indexes, tgt_bboxes, tgt_labels class TestRpnTargetAssignOp(OpTest): def setUp(self): - iou = np.random.random((10, 8)).astype("float32") - self.op_type = "rpn_target_assign" - self.inputs = {'DistMat': iou} - self.attrs = { - 'rpn_batch_size_per_im': 256, - 'rpn_positive_overlap': 0.95, - 'rpn_negative_overlap': 0.3, - 'fg_fraction': 0.25, - 'fix_seed': True - } - loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3, - 0.25) - self.outputs = { - 'LocationIndex': loc_index, - 'ScoreIndex': score_index, - 'TargetLabel': tgt_lbl, - } + n, c, h, w = 2, 4, 14, 14 + anchor = get_anchor(n, c, h, w) + gt_num = 10 
+ anchor = anchor.reshape(-1, 4) + anchor_num = anchor.shape[0] - def test_check_output(self): - self.check_output() + im_shapes = [[64, 64], [64, 64]] + gt_box, lod = _generate_groundtruth(im_shapes, 3, 4) + bbox = np.vstack([v['boxes'] for v in gt_box]) + iou = _bbox_overlaps(bbox, anchor) + + anchor = anchor.astype('float32') + bbox = bbox.astype('float32') + iou = iou.astype('float32') + + loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob( + anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25) -class TestRpnTargetAssignOp2(OpTest): - def setUp(self): - iou = np.random.random((10, 20)).astype("float32") self.op_type = "rpn_target_assign" - self.inputs = {'DistMat': iou} + self.inputs = { + 'Anchor': anchor, + 'GtBox': (bbox, [[4, 4]]), + 'DistMat': (iou, [[4, 4]]), + } self.attrs = { - 'rpn_batch_size_per_im': 128, - 'rpn_positive_overlap': 0.5, - 'rpn_negative_overlap': 0.5, - 'fg_fraction': 0.5, + 'rpn_batch_size_per_im': 25600, + 'rpn_positive_overlap': 0.95, + 'rpn_negative_overlap': 0.03, + 'fg_fraction': 0.25, 'fix_seed': True } - loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5, - 0.5) self.outputs = { - 'LocationIndex': loc_index, - 'ScoreIndex': score_index, - 'TargetLabel': tgt_lbl, + 'LocationIndex': loc_index.astype('int32'), + 'ScoreIndex': score_index.astype('int32'), + 'TargetBBox': tgt_bbox.astype('float32'), + 'TargetLabel': tgt_lbl.astype('int64'), } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9814ec0a15e1803b356f300d378c31e57ba36c09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py @@ -0,0 +1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +def sequence_enumerate(input_seq, in_lod, win_size, pad_value): + lod0 = [0] + for i in range(0, len(in_lod[0])): + lod0.append(lod0[i] + in_lod[0][i]) + out_seq = [] + for i in range(0, len(lod0) - 1): + for idx in range(lod0[i], lod0[i + 1]): + single_seq = [] + for word_idx in range(win_size): + word_pos = idx + word_idx + dat = input_seq[word_pos] if word_pos < lod0[i+1] \ + else pad_value + single_seq.append(dat) + out_seq.append(single_seq) + return out_seq + + +class TestSequenceEnumerateOp(OpTest): + def setUp(self): + self.op_type = "sequence_enumerate" + self.init_test_case() + self.inputs = {'X': (self.in_seq, self.lod)} + self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value} + self.outputs = {'Out': (self.out_seq, self.lod)} + + def test_check_output(self): + self.check_output() + + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 2 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64") + self.lod = [[9, 4, 11, 6]] + self.win_size = 2 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int64") + + +class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 5 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 30 + self.pad_value = 0 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp): + def init_test_case(self): + self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + self.lod = [[9, 4, 11, 6]] + self.win_size = 5 + self.pad_value = 5 + out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size, + self.pad_value) + self.out_seq = np.array(out_seq).astype("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index bddeb6617c1743de946b3c5b4b0a465d85f35ce3..8a330e0dee7eda02d0858446778363f2235a3d73 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1096,7 +1096,8 @@ class DistributeTranspiler(object): self.table_name] zero_dim = int( - math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints))) + math.ceil(origin_param_var.shape[0] / float( + len(self.pserver_endpoints)))) table_shape = list(origin_param_var.shape) table_shape[0] = zero_dim
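The final hunk makes DistributeTranspiler compute the lookup-table shard height with true division: under Python 2, origin_param_var.shape[0] / len(self.pserver_endpoints) floors the quotient before math.ceil runs, so the per-pserver slice can come out one row short. Below is a minimal sketch of the effect, not part of the patch, using hypothetical values (an 11-row table split across 4 parameter servers); the names rows and pservers are illustrative only.

import math

rows, pservers = 11, 4  # hypothetical table height and pserver count

# Integer (floor) division discards the fraction before ceil() sees it.
buggy_dim = int(math.ceil(rows // pservers))        # 2 -> 4 shards cover only 8 rows
# Casting the divisor to float keeps the fraction, as in the patched code.
fixed_dim = int(math.ceil(rows / float(pservers)))  # 3 -> 4 shards cover all 11 rows

print(buggy_dim, fixed_dim)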