diff --git a/.travis.yml b/.travis.yml index 361136ac2c8d899a0d7a4d7945083fcc489551b5..8c2d9f143b3102c142ec2d5d193b82936d04fba8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,6 @@ services: os: - linux env: - - JOB=doc - JOB=check_style - JOB=build_android addons: diff --git a/README.md b/README.md index 45186ec4ef48dc305b2616dbf4966f01c3609962..46fdef5e376d3f5bf49ef10c62f5b3a6637913c1 100644 --- a/README.md +++ b/README.md @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==0.14.0.post87 +pip install paddlepaddle-gpu==0.15.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==0.14.0.post85 +pip install paddlepaddle-gpu==0.15.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 82c424fb79d5596c31891bc395699bf9ff4e7e7e..dc5427acd45f5da90317e7a3dc25f5453e2a7a00 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -29,7 +29,7 @@ INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) MESSAGE(STATUS "use pre defined download url") - SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 6d230942321f8d82a14f5c58037134deb0ab222d..a67512578147fc7223714dbc4cd124b831fb4775 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -211,7 +211,7 @@ function(merge_static_libs TARGET_NAME) set(libfiles ${libfiles} $) #endif() endforeach() - + # windows cmd return error in clean env. # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD @@ -255,7 +255,7 @@ function(cc_library TARGET_NAME) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() - + # cpplint code style foreach(source_file ${cc_library_SRCS}) string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) @@ -298,11 +298,10 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - + endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - endif() endif() endfunction(cc_test) @@ -366,11 +365,10 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - + endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - endif() endif() endfunction(nv_test) @@ -558,26 +556,26 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) set(${HDRS}) if (MOBILE_INFERENCE) - set(EXTRA_FLAG "lite:") + set(EXTRA_FLAG "lite:") else() - set(EXTRA_FLAG "") + set(EXTRA_FLAG "") endif() foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(FIL_WE ${FIL} NAME_WE) - + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") list(APPEND ${SRCS} "${_protobuf_protoc_src}") list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") - + add_custom_command( OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}" COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR} --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} DEPENDS ${ABS_FIL} protoc @@ -646,7 +644,7 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but - # somehow it didn't. line 602 to 604 is to patching this. Leaving this here + # somehow it didn't. line 602 to 604 is to patching this. Leaving this here # for now to enable dist CI. protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 8f65a737c43a124c05574d6eb9c3050fdab5299a..fa0e834a1dfd6e60f0ec07945be9a4d84017316f 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -18,7 +18,7 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) if(WITH_DSO) set(TENSORRT_FOUND ON) - endif(WITH DSO) + endif(WITH_DSO) else() set(TENSORRT_FOUND OFF) endif() diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..77aa2a5322057d582435e83f38476833d1f73c48 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,7 @@ +# For Readers and Developers + +Thanks for reading PaddlePaddle documentation. + +Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [Fluiddoc Repo](https://github.com/PaddlePaddle/Paddle) and updated in Fluiddoc Repo. + +Please turn to Fluiddoc Repo for the latest documentation. diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt deleted file mode 100644 index be92af3902769a65c77953c9f3cb1f3aa3738d79..0000000000000000000000000000000000000000 --- a/doc/fluid/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -if(NOT DEFINED SPHINX_THEME) - set(SPHINX_THEME default) -endif() - -if(NOT DEFINED SPHINX_THEME_DIR) - set(SPHINX_THEME_DIR) -endif() - -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") - -# HTML output director -set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") - -set(IMPORT_PADDLE_STRING "") -set(IMPORT_PADDLEV2_STRING "") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" - "${BINARY_BUILD_DIR_EN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_fluid_docs - html - ${BINARY_BUILD_DIR_EN} - ${SPHINX_CACHE_DIR_EN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_EN}) - -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") - -# HTML output directory -set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in" - "${BINARY_BUILD_DIR_CN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_fluid_docs_cn - html - ${BINARY_BUILD_DIR_CN} - ${SPHINX_CACHE_DIR_CN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_CN}) - -add_subdirectory(api) diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt deleted file mode 100644 index 435d6e10fb02e9b2a8147f37da33e8848cc9b98a..0000000000000000000000000000000000000000 --- a/doc/fluid/api/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") - -# HTML output director -set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") - -set(IMPORT_PADDLE_STRING "import paddle") -set(IMPORT_PADDLEV2_STRING "import paddle.v2") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" - "${BINARY_BUILD_DIR_EN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_fluid_apis - html - ${BINARY_BUILD_DIR_EN} - ${SPHINX_CACHE_DIR_EN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_EN}) - -add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python) diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst deleted file mode 100644 index 496f5b29875443f0c44f50fcb3ca837f4e7bcd12..0000000000000000000000000000000000000000 --- a/doc/fluid/api/average.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============= -fluid.average -============= - -.. _api_fluid_average_WeightedAverage: - -WeightedAverage ---------------- - -.. autoclass:: paddle.fluid.average.WeightedAverage - :members: - :noindex: - diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst deleted file mode 100644 index 115e0d24b39928cfc349f72e0a21d6374cd8cd75..0000000000000000000000000000000000000000 --- a/doc/fluid/api/backward.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============== -fluid.backward -============== - -.. _api_fluid_backward_append_backward: - -append_backward ---------------- - -.. autofunction:: paddle.fluid.backward.append_backward - :noindex: - -.. _api_fluid_backward_calc_gradient: - -calc_gradient -------------- - -.. autofunction:: paddle.fluid.backward.calc_gradient - :noindex: - diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst deleted file mode 100644 index aeefbb95a46e5d5ed46375e388a720fad2711779..0000000000000000000000000000000000000000 --- a/doc/fluid/api/clip.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -========== -fluid.clip -========== - -.. _api_fluid_clip_ErrorClipByValue: - -ErrorClipByValue ----------------- - -.. autoclass:: paddle.fluid.clip.ErrorClipByValue - :members: - :noindex: - -.. _api_fluid_clip_GradientClipByValue: - -GradientClipByValue -------------------- - -.. autoclass:: paddle.fluid.clip.GradientClipByValue - :members: - :noindex: - -.. _api_fluid_clip_GradientClipByNorm: - -GradientClipByNorm ------------------- - -.. autoclass:: paddle.fluid.clip.GradientClipByNorm - :members: - :noindex: - -.. _api_fluid_clip_GradientClipByGlobalNorm: - -GradientClipByGlobalNorm ------------------------- - -.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm - :members: - :noindex: - diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst deleted file mode 100644 index 1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data/data_reader.rst +++ /dev/null @@ -1,72 +0,0 @@ -===================== -Data Reader Interface -===================== - - -DataTypes -========= - -.. autofunction:: paddle.v2.data_type.dense_array - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_non_value_slot - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_value_slot - :noindex: - -.. autoclass:: paddle.v2.data_type.InputType - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.reader - :members: - :noindex: - -.. automodule:: paddle.reader.creator - :members: - :noindex: - -minibatch -========= - -.. automodule:: paddle.v2.minibatch - :members: - :noindex: diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst deleted file mode 100644 index e7c8be4452bf55e0967d750c2e624e8e316e9330..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data/dataset.rst +++ /dev/null @@ -1,82 +0,0 @@ -Dataset -======= - -.. automodule:: paddle.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. automodule:: paddle.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. automodule:: paddle.dataset.wmt14 - :members: - :noindex: - -wmt16 -+++++ - -.. automodule:: paddle.dataset.wmt16 - :members: - :noindex: diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst deleted file mode 100644 index 97651ffa6be56cf3ecaca2caca38a353fa5c1f49..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data/image.rst +++ /dev/null @@ -1,5 +0,0 @@ -Image Interface -=============== - -.. automodule:: paddle.v2.image - :members: diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst deleted file mode 100644 index 11d2890f5b3446e37c3ef31e5a17ebebe169dbc8..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data_feeder.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -================= -fluid.data_feeder -================= - -.. _api_fluid_data_feeder_DataFeeder: - -DataFeeder ----------- - -.. autoclass:: paddle.fluid.data_feeder.DataFeeder - :members: - :noindex: - diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst deleted file mode 100644 index f23ecc1f80030f20359ce9675130a167722606c9..0000000000000000000000000000000000000000 --- a/doc/fluid/api/executor.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============== -fluid.executor -============== - -.. _api_fluid_executor_Executor: - -Executor --------- - -.. autoclass:: paddle.fluid.executor.Executor - :members: - :noindex: - -.. _api_fluid_executor_global_scope: - -global_scope ------------- - -.. autofunction:: paddle.fluid.executor.global_scope - :noindex: - -.. _api_fluid_executor_scope_guard: - -scope_guard ------------ - -.. autofunction:: paddle.fluid.executor.scope_guard - :noindex: - -.. _api_fluid_executor__switch_scope: - -_switch_scope -------------- - -.. autofunction:: paddle.fluid.executor._switch_scope - :noindex: - diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst deleted file mode 100644 index 7eab58355c3648d929d3b5d98984adce9034f016..0000000000000000000000000000000000000000 --- a/doc/fluid/api/fluid.rst +++ /dev/null @@ -1,362 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -===== -fluid -===== - -.. _api_fluid_Block: - -Block ------ - -.. autoclass:: paddle.fluid.Block - :members: - :noindex: - -.. _api_fluid_Variable: - -Variable --------- - -.. autoclass:: paddle.fluid.Variable - :members: - :noindex: - -.. _api_fluid_Program: - -Program -------- - -.. autoclass:: paddle.fluid.Program - :members: - :noindex: - -.. _api_fluid_Operator: - -Operator --------- - -.. autoclass:: paddle.fluid.Operator - :members: - :noindex: - -.. _api_fluid_default_startup_program: - -default_startup_program ------------------------ - -.. autofunction:: paddle.fluid.default_startup_program - :noindex: - -.. _api_fluid_default_main_program: - -default_main_program --------------------- - -.. autofunction:: paddle.fluid.default_main_program - :noindex: - -.. _api_fluid_program_guard: - -program_guard -------------- - -.. autofunction:: paddle.fluid.program_guard - :noindex: - -.. _api_fluid_get_var: - -get_var -------- - -.. autofunction:: paddle.fluid.get_var - :noindex: - -.. _api_fluid_Executor: - -Executor --------- - -.. autoclass:: paddle.fluid.Executor - :members: - :noindex: - -.. _api_fluid_global_scope: - -global_scope ------------- - -.. autofunction:: paddle.fluid.global_scope - :noindex: - -.. _api_fluid_scope_guard: - -scope_guard ------------ - -.. autofunction:: paddle.fluid.scope_guard - :noindex: - -.. _api_fluid__switch_scope: - -_switch_scope -------------- - -.. autofunction:: paddle.fluid._switch_scope - :noindex: - - -.. _api_fluid_make_channel: - -make_channel ------------- - -.. autofunction:: paddle.fluid.make_channel - :noindex: - -.. _api_fluid_channel_send: - -channel_send ------------- - -.. autofunction:: paddle.fluid.channel_send - :noindex: - -.. _api_fluid_channel_recv: - -channel_recv ------------- - -.. autofunction:: paddle.fluid.channel_recv - :noindex: - -.. _api_fluid_channel_close: - -channel_close -------------- - -.. autofunction:: paddle.fluid.channel_close - :noindex: - -.. _api_fluid_Select: - -Select ------- - -.. autoclass:: paddle.fluid.Select - :members: - :noindex: - -.. _api_fluid_Trainer: - -Trainer -------- - -.. autoclass:: paddle.fluid.Trainer - :members: - :noindex: - -.. _api_fluid_BeginEpochEvent: - -BeginEpochEvent ---------------- - -.. autoclass:: paddle.fluid.BeginEpochEvent - :members: - :noindex: - -.. _api_fluid_EndEpochEvent: - -EndEpochEvent -------------- - -.. autoclass:: paddle.fluid.EndEpochEvent - :members: - :noindex: - -.. _api_fluid_BeginStepEvent: - -BeginStepEvent --------------- - -.. autoclass:: paddle.fluid.BeginStepEvent - :members: - :noindex: - -.. _api_fluid_EndStepEvent: - -EndStepEvent ------------- - -.. autoclass:: paddle.fluid.EndStepEvent - :members: - :noindex: - -.. _api_fluid_CheckpointConfig: - -CheckpointConfig ----------------- - -.. autoclass:: paddle.fluid.CheckpointConfig - :members: - :noindex: - -.. _api_fluid_Inferencer: - -Inferencer ----------- - -.. autoclass:: paddle.fluid.Inferencer - :members: - :noindex: - -.. _api_fluid_DistributeTranspiler: - -DistributeTranspiler --------------------- - -.. autoclass:: paddle.fluid.DistributeTranspiler - :members: - :noindex: - -.. _api_fluid_memory_optimize: - -memory_optimize ---------------- - -.. autofunction:: paddle.fluid.memory_optimize - :noindex: - -.. _api_fluid_release_memory: - -release_memory --------------- - -.. autofunction:: paddle.fluid.release_memory - :noindex: - -.. _api_fluid_ParallelExecutor: - -ParallelExecutor ----------------- - -.. autoclass:: paddle.fluid.ParallelExecutor - :members: - :noindex: - -.. _api_fluid_ExecutionStrategy: - -ExecutionStrategy ------------------ - -.. autoclass:: paddle.fluid.ExecutionStrategy - :members: - :noindex: - -.. _api_fluid_BuildStrategy: - -BuildStrategy -------------- - -.. autoclass:: paddle.fluid.BuildStrategy - :members: - :noindex: - -.. _api_fluid_create_lod_tensor: - -create_lod_tensor ------------------ - -.. autofunction:: paddle.fluid.create_lod_tensor - :noindex: - -.. _api_fluid_create_random_int_lodtensor: - -create_random_int_lodtensor ---------------------------- - -.. autofunction:: paddle.fluid.create_random_int_lodtensor - :noindex: - -.. _api_fluid_LoDTensor: - -LoDTensor ---------- - -.. autoclass:: paddle.fluid.LoDTensor - :members: - :noindex: - -.. _api_fluid_CPUPlace: - -CPUPlace --------- - -.. autoclass:: paddle.fluid.CPUPlace - :members: - :noindex: - -.. _api_fluid_CUDAPlace: - -CUDAPlace ---------- - -.. autoclass:: paddle.fluid.CUDAPlace - :members: - :noindex: - -.. _api_fluid_CUDAPinnedPlace: - -CUDAPinnedPlace ---------------- - -.. autoclass:: paddle.fluid.CUDAPinnedPlace - :members: - :noindex: - -.. _api_fluid_Tensor: - -Tensor ------- - -.. autoclass:: paddle.fluid.Tensor - :members: - :noindex: - -.. _api_fluid_ParamAttr: - -ParamAttr ---------- - -.. autoclass:: paddle.fluid.ParamAttr - :members: - :noindex: - -.. _api_fluid_WeightNormParamAttr: - -WeightNormParamAttr -------------------- - -.. autoclass:: paddle.fluid.WeightNormParamAttr - :members: - :noindex: - -.. _api_fluid_DataFeeder: - -DataFeeder ----------- - -.. autoclass:: paddle.fluid.DataFeeder - :members: - :noindex: - -.. _api_fluid_Scope: - -Scope ------ - -.. autoclass:: paddle.fluid.Scope - :members: - :noindex: - diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py deleted file mode 100644 index 02efce2bf8392c62a7600c272bedcadc6563f927..0000000000000000000000000000000000000000 --- a/doc/fluid/api/gen_doc.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import argparse -import sys -import types - -import paddle.fluid as fluid - - -def parse_arg(): - parser = argparse.ArgumentParser() - parser.add_argument('--submodules', nargs="*") - parser.add_argument( - 'module', type=str, help='Generate the documentation of which module') - return parser.parse_args() - - -class DocGenerator(object): - def __init__(self, module_name=None, stream=sys.stdout): - if module_name == "": - module_name = None - self.stream = stream - if module_name is None: - self.module_name = "fluid" - else: - self.module_name = "fluid." + module_name - if module_name is None: - self.module = fluid - else: - if not hasattr(fluid, module_name): - raise ValueError("Cannot find fluid.{0}".format(module_name)) - else: - self.module = getattr(fluid, module_name) - self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -''') - - self._print_header_(self.module_name, dot='=', is_title=True) - - def print_submodule(self, submodule_name): - submodule = getattr(self.module, submodule_name) - if submodule is None: - raise ValueError("Cannot find submodule {0}".format(submodule_name)) - self.print_section(submodule_name) - - for item in submodule.__all__: - self.print_item(item) - - def print_current_module(self): - for item in self.module.__all__: - self.print_item(item) - - def print_section(self, name): - self._print_header_(name, dot='=', is_title=False) - - def print_item(self, name): - item = getattr(self.module, name, None) - if item is None: - return - if isinstance(item, types.TypeType): - self.print_class(name) - elif isinstance(item, types.FunctionType): - self.print_method(name) - else: - pass - - def print_class(self, name): - self._print_ref_(name) - self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autoclass:: paddle.{0}.{1} - :members: - :noindex: - -'''.format(self.module_name, name)) - - def print_method(self, name): - self._print_ref_(name) - self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autofunction:: paddle.{0}.{1} - :noindex: - -'''.format(self.module_name, name)) - - def _print_header_(self, name, dot, is_title): - dot_line = dot * len(name) - if is_title: - self.stream.write(dot_line) - self.stream.write('\n') - self.stream.write(name) - self.stream.write('\n') - self.stream.write(dot_line) - self.stream.write('\n') - self.stream.write('\n') - - def _print_ref_(self, name): - self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join( - self.module_name.split(".")), name)) - - -def main(): - args = parse_arg() - gen = DocGenerator(args.module) - if args.submodules is None: - gen.print_current_module() - else: - for submodule_name in args.submodules: - gen.print_submodule(submodule_name) - - -if __name__ == '__main__': - main() diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh deleted file mode 100755 index b14ee29873c50fd011f6c48b754767ac8918252a..0000000000000000000000000000000000000000 --- a/doc/fluid/api/gen_doc.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst - -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler -do - python gen_doc.py ${module} > ${module}.rst -done - -python gen_doc.py "" > fluid.rst diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst deleted file mode 100644 index 359406819a993e7eaf2155c839373df44d97b103..0000000000000000000000000000000000000000 --- a/doc/fluid/api/index_en.rst +++ /dev/null @@ -1,26 +0,0 @@ -============= -API Reference -============= - -.. toctree:: - :maxdepth: 1 - - fluid.rst - layers.rst - data_feeder.rst - executor.rst - initializer.rst - metrics.rst - nets.rst - clip.rst - optimizer.rst - param_attr.rst - profiler.rst - regularizer.rst - io.rst - data.rst - transpiler.rst - recordio_writer.rst - backward.rst - average.rst - profiler.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst deleted file mode 100644 index dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607..0000000000000000000000000000000000000000 --- a/doc/fluid/api/initializer.rst +++ /dev/null @@ -1,131 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -================= -fluid.initializer -================= - -.. _api_fluid_initializer_Constant: - -Constant --------- - -.. autoclass:: paddle.fluid.initializer.Constant - :members: - :noindex: - -.. _api_fluid_initializer_Uniform: - -Uniform -------- - -.. autoclass:: paddle.fluid.initializer.Uniform - :members: - :noindex: - -.. _api_fluid_initializer_Normal: - -Normal ------- - -.. autoclass:: paddle.fluid.initializer.Normal - :members: - :noindex: - -.. _api_fluid_initializer_Xavier: - -Xavier ------- - -.. autoclass:: paddle.fluid.initializer.Xavier - :members: - :noindex: - -.. _api_fluid_initializer_Bilinear: - -Bilinear --------- - -.. autoclass:: paddle.fluid.initializer.Bilinear - :members: - :noindex: - -.. _api_fluid_initializer_MSRA: - -MSRA ----- - -.. autoclass:: paddle.fluid.initializer.MSRA - :members: - :noindex: - -.. _api_fluid_initializer_force_init_on_cpu: - -force_init_on_cpu ------------------ - -.. autofunction:: paddle.fluid.initializer.force_init_on_cpu - :noindex: - -.. _api_fluid_initializer_init_on_cpu: - -init_on_cpu ------------ - -.. autofunction:: paddle.fluid.initializer.init_on_cpu - :noindex: - -.. _api_fluid_initializer_ConstantInitializer: - -ConstantInitializer -------------------- - -.. autoclass:: paddle.fluid.initializer.ConstantInitializer - :members: - :noindex: - -.. _api_fluid_initializer_UniformInitializer: - -UniformInitializer ------------------- - -.. autoclass:: paddle.fluid.initializer.UniformInitializer - :members: - :noindex: - -.. _api_fluid_initializer_NormalInitializer: - -NormalInitializer ------------------ - -.. autoclass:: paddle.fluid.initializer.NormalInitializer - :members: - :noindex: - -.. _api_fluid_initializer_XavierInitializer: - -XavierInitializer ------------------ - -.. autoclass:: paddle.fluid.initializer.XavierInitializer - :members: - :noindex: - -.. _api_fluid_initializer_BilinearInitializer: - -BilinearInitializer -------------------- - -.. autoclass:: paddle.fluid.initializer.BilinearInitializer - :members: - :noindex: - -.. _api_fluid_initializer_MSRAInitializer: - -MSRAInitializer ---------------- - -.. autoclass:: paddle.fluid.initializer.MSRAInitializer - :members: - :noindex: - diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst deleted file mode 100644 index 7cee0bc4d9aa2c51517d23a381f14a8f63cc3681..0000000000000000000000000000000000000000 --- a/doc/fluid/api/io.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -======== -fluid.io -======== - -.. _api_fluid_io_save_vars: - -save_vars ---------- - -.. autofunction:: paddle.fluid.io.save_vars - :noindex: - -.. _api_fluid_io_save_params: - -save_params ------------ - -.. autofunction:: paddle.fluid.io.save_params - :noindex: - -.. _api_fluid_io_save_persistables: - -save_persistables ------------------ - -.. autofunction:: paddle.fluid.io.save_persistables - :noindex: - -.. _api_fluid_io_load_vars: - -load_vars ---------- - -.. autofunction:: paddle.fluid.io.load_vars - :noindex: - -.. _api_fluid_io_load_params: - -load_params ------------ - -.. autofunction:: paddle.fluid.io.load_params - :noindex: - -.. _api_fluid_io_load_persistables: - -load_persistables ------------------ - -.. autofunction:: paddle.fluid.io.load_persistables - :noindex: - -.. _api_fluid_io_save_inference_model: - -save_inference_model --------------------- - -.. autofunction:: paddle.fluid.io.save_inference_model - :noindex: - -.. _api_fluid_io_load_inference_model: - -load_inference_model --------------------- - -.. autofunction:: paddle.fluid.io.load_inference_model - :noindex: - -.. _api_fluid_io_get_inference_program: - -get_inference_program ---------------------- - -.. autofunction:: paddle.fluid.io.get_inference_program - :noindex: - -.. _api_fluid_io_save_checkpoint: - -save_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.save_checkpoint - :noindex: - -.. _api_fluid_io_load_checkpoint: - -load_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.load_checkpoint - :noindex: - -.. _api_fluid_io_clean_checkpoint: - -clean_checkpoint ----------------- - -.. autofunction:: paddle.fluid.io.clean_checkpoint - :noindex: - -.. _api_fluid_io_load_persist_vars_without_grad: - -load_persist_vars_without_grad ------------------------------- - -.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad - :noindex: - -.. _api_fluid_io_save_persist_vars_without_grad: - -save_persist_vars_without_grad ------------------------------- - -.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad - :noindex: - -.. _api_fluid_io_get_latest_checkpoint_serial: - -get_latest_checkpoint_serial ----------------------------- - -.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial - :noindex: - diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst deleted file mode 100644 index 6f0267cd7a1d0afcdcb1596a46ffe2d15eea100d..0000000000000000000000000000000000000000 --- a/doc/fluid/api/layers.rst +++ /dev/null @@ -1,1794 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============ -fluid.layers -============ - -control_flow -============ - -.. _api_fluid_layers_split_lod_tensor: - -split_lod_tensor ----------------- - -.. autofunction:: paddle.fluid.layers.split_lod_tensor - :noindex: - -.. _api_fluid_layers_merge_lod_tensor: - -merge_lod_tensor ----------------- - -.. autofunction:: paddle.fluid.layers.merge_lod_tensor - :noindex: - -.. _api_fluid_layers_BlockGuard: - -BlockGuard ----------- - -.. autoclass:: paddle.fluid.layers.BlockGuard - :members: - :noindex: - -.. _api_fluid_layers_BlockGuardWithCompletion: - -BlockGuardWithCompletion ------------------------- - -.. autoclass:: paddle.fluid.layers.BlockGuardWithCompletion - :members: - :noindex: - -.. _api_fluid_layers_WhileGuard: - -WhileGuard ----------- - -.. autoclass:: paddle.fluid.layers.WhileGuard - :members: - :noindex: - -.. _api_fluid_layers_While: - -While ------ - -.. autoclass:: paddle.fluid.layers.While - :members: - :noindex: - -.. _api_fluid_layers_Switch: - -Switch ------- - -.. autoclass:: paddle.fluid.layers.Switch - :members: - :noindex: - -.. _api_fluid_layers_lod_rank_table: - -lod_rank_table --------------- - -.. autofunction:: paddle.fluid.layers.lod_rank_table - :noindex: - -.. _api_fluid_layers_max_sequence_len: - -max_sequence_len ----------------- - -.. autofunction:: paddle.fluid.layers.max_sequence_len - :noindex: - -.. _api_fluid_layers_lod_tensor_to_array: - -lod_tensor_to_array -------------------- - -.. autofunction:: paddle.fluid.layers.lod_tensor_to_array - :noindex: - -.. _api_fluid_layers_array_to_lod_tensor: - -array_to_lod_tensor -------------------- - -.. autofunction:: paddle.fluid.layers.array_to_lod_tensor - :noindex: - -.. _api_fluid_layers_increment: - -increment ---------- - -.. autofunction:: paddle.fluid.layers.increment - :noindex: - -.. _api_fluid_layers_array_write: - -array_write ------------ - -.. autofunction:: paddle.fluid.layers.array_write - :noindex: - -.. _api_fluid_layers_create_array: - -create_array ------------- - -.. autofunction:: paddle.fluid.layers.create_array - :noindex: - -.. _api_fluid_layers_less_than: - -less_than ---------- - -.. autofunction:: paddle.fluid.layers.less_than - :noindex: - -.. _api_fluid_layers_equal: - -equal ------ - -.. autofunction:: paddle.fluid.layers.equal - :noindex: - -.. _api_fluid_layers_array_read: - -array_read ----------- - -.. autofunction:: paddle.fluid.layers.array_read - :noindex: - -.. _api_fluid_layers_shrink_memory: - -shrink_memory -------------- - -.. autofunction:: paddle.fluid.layers.shrink_memory - :noindex: - -.. _api_fluid_layers_array_length: - -array_length ------------- - -.. autofunction:: paddle.fluid.layers.array_length - :noindex: - -.. _api_fluid_layers_IfElse: - -IfElse ------- - -.. autoclass:: paddle.fluid.layers.IfElse - :members: - :noindex: - -.. _api_fluid_layers_DynamicRNN: - -DynamicRNN ----------- - -.. autoclass:: paddle.fluid.layers.DynamicRNN - :members: - :noindex: - -.. _api_fluid_layers_ConditionalBlock: - -ConditionalBlock ----------------- - -.. autoclass:: paddle.fluid.layers.ConditionalBlock - :members: - :noindex: - -.. _api_fluid_layers_StaticRNN: - -StaticRNN ---------- - -.. autoclass:: paddle.fluid.layers.StaticRNN - :members: - :noindex: - -.. _api_fluid_layers_reorder_lod_tensor_by_rank: - -reorder_lod_tensor_by_rank --------------------------- - -.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank - :noindex: - -.. _api_fluid_layers_ParallelDo: - -ParallelDo ----------- - -.. autoclass:: paddle.fluid.layers.ParallelDo - :members: - :noindex: - -.. _api_fluid_layers_Print: - -Print ------ - -.. autofunction:: paddle.fluid.layers.Print - :noindex: - -.. _api_fluid_layers_is_empty: - -is_empty --------- - -.. autofunction:: paddle.fluid.layers.is_empty - :noindex: - -device -====== - -.. _api_fluid_layers_get_places: - -get_places ----------- - -.. autofunction:: paddle.fluid.layers.get_places - :noindex: - -io -== - -.. _api_fluid_layers_data: - -data ----- - -.. autofunction:: paddle.fluid.layers.data - :noindex: - -.. _api_fluid_layers_BlockGuardServ: - -BlockGuardServ --------------- - -.. autoclass:: paddle.fluid.layers.BlockGuardServ - :members: - :noindex: - -.. _api_fluid_layers_ListenAndServ: - -ListenAndServ -------------- - -.. autoclass:: paddle.fluid.layers.ListenAndServ - :members: - :noindex: - -.. _api_fluid_layers_Send: - -Send ----- - -.. autofunction:: paddle.fluid.layers.Send - :noindex: - -.. _api_fluid_layers_Recv: - -Recv ----- - -.. autofunction:: paddle.fluid.layers.Recv - :noindex: - -.. _api_fluid_layers_open_recordio_file: - -open_recordio_file ------------------- - -.. autofunction:: paddle.fluid.layers.open_recordio_file - :noindex: - -.. _api_fluid_layers_open_files: - -open_files ----------- - -.. autofunction:: paddle.fluid.layers.open_files - :noindex: - -.. _api_fluid_layers_read_file: - -read_file ---------- - -.. autofunction:: paddle.fluid.layers.read_file - :noindex: - -.. _api_fluid_layers_shuffle: - -shuffle -------- - -.. autofunction:: paddle.fluid.layers.shuffle - :noindex: - -.. _api_fluid_layers_batch: - -batch ------ - -.. autofunction:: paddle.fluid.layers.batch - :noindex: - -.. _api_fluid_layers_double_buffer: - -double_buffer -------------- - -.. autofunction:: paddle.fluid.layers.double_buffer - :noindex: - -.. _api_fluid_layers_random_data_generator: - -random_data_generator ---------------------- - -.. autofunction:: paddle.fluid.layers.random_data_generator - :noindex: - -.. _api_fluid_layers_Preprocessor: - -Preprocessor ------------- - -.. autoclass:: paddle.fluid.layers.Preprocessor - :members: - :noindex: - -.. _api_fluid_layers_load: - -load ----- - -.. autofunction:: paddle.fluid.layers.load - :noindex: - -nn -== - -.. _api_fluid_layers_fc: - -fc --- - -.. autofunction:: paddle.fluid.layers.fc - :noindex: - -.. _api_fluid_layers_embedding: - -embedding ---------- - -.. autofunction:: paddle.fluid.layers.embedding - :noindex: - -.. _api_fluid_layers_dynamic_lstm: - -dynamic_lstm ------------- - -.. autofunction:: paddle.fluid.layers.dynamic_lstm - :noindex: - -.. _api_fluid_layers_dynamic_lstmp: - -dynamic_lstmp -------------- - -.. autofunction:: paddle.fluid.layers.dynamic_lstmp - :noindex: - -.. _api_fluid_layers_dynamic_gru: - -dynamic_gru ------------ - -.. autofunction:: paddle.fluid.layers.dynamic_gru - :noindex: - -.. _api_fluid_layers_gru_unit: - -gru_unit --------- - -.. autofunction:: paddle.fluid.layers.gru_unit - :noindex: - -.. _api_fluid_layers_linear_chain_crf: - -linear_chain_crf ----------------- - -.. autofunction:: paddle.fluid.layers.linear_chain_crf - :noindex: - -.. _api_fluid_layers_crf_decoding: - -crf_decoding ------------- - -.. autofunction:: paddle.fluid.layers.crf_decoding - :noindex: - -.. _api_fluid_layers_cos_sim: - -cos_sim -------- - -.. autofunction:: paddle.fluid.layers.cos_sim - :noindex: - -.. _api_fluid_layers_cross_entropy: - -cross_entropy -------------- - -.. autofunction:: paddle.fluid.layers.cross_entropy - :noindex: - -.. _api_fluid_layers_square_error_cost: - -square_error_cost ------------------ - -.. autofunction:: paddle.fluid.layers.square_error_cost - :noindex: - -.. _api_fluid_layers_chunk_eval: - -chunk_eval ----------- - -.. autofunction:: paddle.fluid.layers.chunk_eval - :noindex: - -.. _api_fluid_layers_sequence_conv: - -sequence_conv -------------- - -.. autofunction:: paddle.fluid.layers.sequence_conv - :noindex: - -.. _api_fluid_layers_conv2d: - -conv2d ------- - -.. autofunction:: paddle.fluid.layers.conv2d - :noindex: - -.. _api_fluid_layers_conv3d: - -conv3d ------- - -.. autofunction:: paddle.fluid.layers.conv3d - :noindex: - -.. _api_fluid_layers_sequence_pool: - -sequence_pool -------------- - -.. autofunction:: paddle.fluid.layers.sequence_pool - :noindex: - -.. _api_fluid_layers_sequence_softmax: - -sequence_softmax ----------------- - -.. autofunction:: paddle.fluid.layers.sequence_softmax - :noindex: - -.. _api_fluid_layers_softmax: - -softmax -------- - -.. autofunction:: paddle.fluid.layers.softmax - :noindex: - -.. _api_fluid_layers_pool2d: - -pool2d ------- - -.. autofunction:: paddle.fluid.layers.pool2d - :noindex: - -.. _api_fluid_layers_pool3d: - -pool3d ------- - -.. autofunction:: paddle.fluid.layers.pool3d - :noindex: - -.. _api_fluid_layers_batch_norm: - -batch_norm ----------- - -.. autofunction:: paddle.fluid.layers.batch_norm - :noindex: - -.. _api_fluid_layers_beam_search_decode: - -beam_search_decode ------------------- - -.. autofunction:: paddle.fluid.layers.beam_search_decode - :noindex: - -.. _api_fluid_layers_conv2d_transpose: - -conv2d_transpose ----------------- - -.. autofunction:: paddle.fluid.layers.conv2d_transpose - :noindex: - -.. _api_fluid_layers_conv3d_transpose: - -conv3d_transpose ----------------- - -.. autofunction:: paddle.fluid.layers.conv3d_transpose - :noindex: - -.. _api_fluid_layers_sequence_expand: - -sequence_expand ---------------- - -.. autofunction:: paddle.fluid.layers.sequence_expand - :noindex: - -.. _api_fluid_layers_lstm_unit: - -lstm_unit ---------- - -.. autofunction:: paddle.fluid.layers.lstm_unit - :noindex: - -.. _api_fluid_layers_reduce_sum: - -reduce_sum ----------- - -.. autofunction:: paddle.fluid.layers.reduce_sum - :noindex: - -.. _api_fluid_layers_reduce_mean: - -reduce_mean ------------ - -.. autofunction:: paddle.fluid.layers.reduce_mean - :noindex: - -.. _api_fluid_layers_reduce_max: - -reduce_max ----------- - -.. autofunction:: paddle.fluid.layers.reduce_max - :noindex: - -.. _api_fluid_layers_reduce_min: - -reduce_min ----------- - -.. autofunction:: paddle.fluid.layers.reduce_min - :noindex: - -.. _api_fluid_layers_reduce_prod: - -reduce_prod ------------ - -.. autofunction:: paddle.fluid.layers.reduce_prod - :noindex: - -.. _api_fluid_layers_sequence_first_step: - -sequence_first_step -------------------- - -.. autofunction:: paddle.fluid.layers.sequence_first_step - :noindex: - -.. _api_fluid_layers_sequence_last_step: - -sequence_last_step ------------------- - -.. autofunction:: paddle.fluid.layers.sequence_last_step - :noindex: - -.. _api_fluid_layers_dropout: - -dropout -------- - -.. autofunction:: paddle.fluid.layers.dropout - :noindex: - -.. _api_fluid_layers_split: - -split ------ - -.. autofunction:: paddle.fluid.layers.split - :noindex: - -.. _api_fluid_layers_ctc_greedy_decoder: - -ctc_greedy_decoder ------------------- - -.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder - :noindex: - -.. _api_fluid_layers_edit_distance: - -edit_distance -------------- - -.. autofunction:: paddle.fluid.layers.edit_distance - :noindex: - -.. _api_fluid_layers_l2_normalize: - -l2_normalize ------------- - -.. autofunction:: paddle.fluid.layers.l2_normalize - :noindex: - -.. _api_fluid_layers_matmul: - -matmul ------- - -.. autofunction:: paddle.fluid.layers.matmul - :noindex: - -.. _api_fluid_layers_topk: - -topk ----- - -.. autofunction:: paddle.fluid.layers.topk - :noindex: - -.. _api_fluid_layers_warpctc: - -warpctc -------- - -.. autofunction:: paddle.fluid.layers.warpctc - :noindex: - -.. _api_fluid_layers_sequence_reshape: - -sequence_reshape ----------------- - -.. autofunction:: paddle.fluid.layers.sequence_reshape - :noindex: - -.. _api_fluid_layers_transpose: - -transpose ---------- - -.. autofunction:: paddle.fluid.layers.transpose - :noindex: - -.. _api_fluid_layers_im2sequence: - -im2sequence ------------ - -.. autofunction:: paddle.fluid.layers.im2sequence - :noindex: - -.. _api_fluid_layers_nce: - -nce ---- - -.. autofunction:: paddle.fluid.layers.nce - :noindex: - -.. _api_fluid_layers_beam_search: - -beam_search ------------ - -.. autofunction:: paddle.fluid.layers.beam_search - :noindex: - -.. _api_fluid_layers_row_conv: - -row_conv --------- - -.. autofunction:: paddle.fluid.layers.row_conv - :noindex: - -.. _api_fluid_layers_multiplex: - -multiplex ---------- - -.. autofunction:: paddle.fluid.layers.multiplex - :noindex: - -.. _api_fluid_layers_layer_norm: - -layer_norm ----------- - -.. autofunction:: paddle.fluid.layers.layer_norm - :noindex: - -.. _api_fluid_layers_softmax_with_cross_entropy: - -softmax_with_cross_entropy --------------------------- - -.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy - :noindex: - -.. _api_fluid_layers_smooth_l1: - -smooth_l1 ---------- - -.. autofunction:: paddle.fluid.layers.smooth_l1 - :noindex: - -.. _api_fluid_layers_one_hot: - -one_hot -------- - -.. autofunction:: paddle.fluid.layers.one_hot - :noindex: - -.. _api_fluid_layers_autoincreased_step_counter: - -autoincreased_step_counter --------------------------- - -.. autofunction:: paddle.fluid.layers.autoincreased_step_counter - :noindex: - -.. _api_fluid_layers_reshape: - -reshape -------- - -.. autofunction:: paddle.fluid.layers.reshape - :noindex: - -.. _api_fluid_layers_lod_reset: - -lod_reset ---------- - -.. autofunction:: paddle.fluid.layers.lod_reset - :noindex: - -.. _api_fluid_layers_lrn: - -lrn ---- - -.. autofunction:: paddle.fluid.layers.lrn - :noindex: - -.. _api_fluid_layers_pad: - -pad ---- - -.. autofunction:: paddle.fluid.layers.pad - :noindex: - -.. _api_fluid_layers_pad_constant_like: - -pad_constant_like ---- - -.. autofunction:: paddle.fluid.layers.pad_constant_like - :noindex: - -.. _api_fluid_layers_label_smooth: - -label_smooth ------------- - -.. autofunction:: paddle.fluid.layers.label_smooth - :noindex: - -.. _api_fluid_layers_roi_pool: - -roi_pool --------- - -.. autofunction:: paddle.fluid.layers.roi_pool - :noindex: - -.. _api_fluid_layers_dice_loss: - -dice_loss ---------- - -.. autofunction:: paddle.fluid.layers.dice_loss - :noindex: - -.. _api_fluid_layers_image_resize: - -image_resize ------------- - -.. autofunction:: paddle.fluid.layers.image_resize - :noindex: - -.. _api_fluid_layers_image_resize_short: - -image_resize_short ------------------- - -.. autofunction:: paddle.fluid.layers.image_resize_short - :noindex: - -.. _api_fluid_layers_resize_bilinear: - -resize_bilinear ---------------- - -.. autofunction:: paddle.fluid.layers.resize_bilinear - :noindex: - -.. _api_fluid_layers_gather: - -gather ------- - -.. autofunction:: paddle.fluid.layers.gather - :noindex: - -.. _api_fluid_layers_random_crop: - -random_crop ------------ - -.. autofunction:: paddle.fluid.layers.random_crop - :noindex: - -.. _api_fluid_layers_mean_iou: - -mean_iou --------- - -.. autofunction:: paddle.fluid.layers.mean_iou - :noindex: - -.. _api_fluid_layers_relu: - -relu ----- - -.. autofunction:: paddle.fluid.layers.relu - :noindex: - -.. _api_fluid_layers_log: - -log ---- - -.. autofunction:: paddle.fluid.layers.log - :noindex: - -.. _api_fluid_layers_crop: - -crop ----- - -.. autofunction:: paddle.fluid.layers.crop - :noindex: - -ops -=== - -.. _api_fluid_layers_mean: - -mean ----- - -.. autofunction:: paddle.fluid.layers.mean - :noindex: - -.. _api_fluid_layers_mul: - -mul ---- - -.. autofunction:: paddle.fluid.layers.mul - :noindex: - -.. _api_fluid_layers_scale: - -scale ------ - -.. autofunction:: paddle.fluid.layers.scale - :noindex: - -.. _api_fluid_layers_sigmoid_cross_entropy_with_logits: - -sigmoid_cross_entropy_with_logits ---------------------------------- - -.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits - :noindex: - -.. _api_fluid_layers_elementwise_add: - -elementwise_add ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_add - :noindex: - -.. _api_fluid_layers_elementwise_div: - -elementwise_div ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_div - :noindex: - -.. _api_fluid_layers_elementwise_sub: - -elementwise_sub ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_sub - :noindex: - -.. _api_fluid_layers_elementwise_mul: - -elementwise_mul ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_mul - :noindex: - -.. _api_fluid_layers_elementwise_max: - -elementwise_max ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_max - :noindex: - -.. _api_fluid_layers_elementwise_min: - -elementwise_min ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_min - :noindex: - -.. _api_fluid_layers_elementwise_pow: - -elementwise_pow ---------------- - -.. autofunction:: paddle.fluid.layers.elementwise_pow - :noindex: - -.. _api_fluid_layers_clip: - -clip ----- - -.. autofunction:: paddle.fluid.layers.clip - :noindex: - -.. _api_fluid_layers_clip_by_norm: - -clip_by_norm ------------- - -.. autofunction:: paddle.fluid.layers.clip_by_norm - :noindex: - -.. _api_fluid_layers_logical_and: - -logical_and ------------ - -.. autofunction:: paddle.fluid.layers.logical_and - :noindex: - -.. _api_fluid_layers_logical_or: - -logical_or ----------- - -.. autofunction:: paddle.fluid.layers.logical_or - :noindex: - -.. _api_fluid_layers_logical_xor: - -logical_xor ------------ - -.. autofunction:: paddle.fluid.layers.logical_xor - :noindex: - -.. _api_fluid_layers_logical_not: - -logical_not ------------ - -.. autofunction:: paddle.fluid.layers.logical_not - :noindex: - -.. _api_fluid_layers_uniform_random_batch_size_like: - -uniform_random_batch_size_like ------------------------------- - -.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like - :noindex: - -.. _api_fluid_layers_gaussian_random: - -gaussian_random ---------------- - -.. autofunction:: paddle.fluid.layers.gaussian_random - :noindex: - -.. _api_fluid_layers_gaussian_random_batch_size_like: - -gaussian_random_batch_size_like -------------------------------- - -.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like - :noindex: - -.. _api_fluid_layers_scatter: - -scatter -------- - -.. autofunction:: paddle.fluid.layers.scatter - :noindex: - -.. _api_fluid_layers_sum: - -sum ---- - -.. autofunction:: paddle.fluid.layers.sum - :noindex: - -.. _api_fluid_layers_slice: - -slice ------ - -.. autofunction:: paddle.fluid.layers.slice - :noindex: - -.. _api_fluid_layers_polygon_box_transform: - -polygon_box_transform ---------------------- - -.. autofunction:: paddle.fluid.layers.polygon_box_transform - :noindex: - -.. _api_fluid_layers_shape: - -shape ------ - -.. autofunction:: paddle.fluid.layers.shape - :noindex: - -.. _api_fluid_layers_iou_similarity: - -iou_similarity --------------- - -.. autofunction:: paddle.fluid.layers.iou_similarity - :noindex: - -.. _api_fluid_layers_maxout: - -maxout ------- - -.. autofunction:: paddle.fluid.layers.maxout - :noindex: - -.. _api_fluid_layers_sigmoid: - -sigmoid -------- - -.. autofunction:: paddle.fluid.layers.sigmoid - :noindex: - -.. _api_fluid_layers_hsigmoid: - -hsigmoid -------- - -.. autofunction:: paddle.fluid.layers.hsigmoid - :noindex: - -.. _api_fluid_layers_logsigmoid: - -logsigmoid ----------- - -.. autofunction:: paddle.fluid.layers.logsigmoid - :noindex: - -.. _api_fluid_layers_exp: - -exp ---- - -.. autofunction:: paddle.fluid.layers.exp - :noindex: - -.. _api_fluid_layers_tanh: - -tanh ----- - -.. autofunction:: paddle.fluid.layers.tanh - :noindex: - -.. _api_fluid_layers_tanh_shrink: - -tanh_shrink ------------ - -.. autofunction:: paddle.fluid.layers.tanh_shrink - :noindex: - -.. _api_fluid_layers_softshrink: - -softshrink ----------- - -.. autofunction:: paddle.fluid.layers.softshrink - :noindex: - -.. _api_fluid_layers_sqrt: - -sqrt ----- - -.. autofunction:: paddle.fluid.layers.sqrt - :noindex: - -.. _api_fluid_layers_abs: - -abs ---- - -.. autofunction:: paddle.fluid.layers.abs - :noindex: - -.. _api_fluid_layers_ceil: - -ceil ----- - -.. autofunction:: paddle.fluid.layers.ceil - :noindex: - -.. _api_fluid_layers_floor: - -floor ------ - -.. autofunction:: paddle.fluid.layers.floor - :noindex: - -.. _api_fluid_layers_cos: - -cos ---- - -.. autofunction:: paddle.fluid.layers.cos - :noindex: - -.. _api_fluid_layers_sin: - -sin ---- - -.. autofunction:: paddle.fluid.layers.sin - :noindex: - -.. _api_fluid_layers_round: - -round ------ - -.. autofunction:: paddle.fluid.layers.round - :noindex: - -.. _api_fluid_layers_reciprocal: - -reciprocal ----------- - -.. autofunction:: paddle.fluid.layers.reciprocal - :noindex: - -.. _api_fluid_layers_square: - -square ------- - -.. autofunction:: paddle.fluid.layers.square - :noindex: - -.. _api_fluid_layers_softplus: - -softplus --------- - -.. autofunction:: paddle.fluid.layers.softplus - :noindex: - -.. _api_fluid_layers_softsign: - -softsign --------- - -.. autofunction:: paddle.fluid.layers.softsign - :noindex: - -.. _api_fluid_layers_brelu: - -brelu ------ - -.. autofunction:: paddle.fluid.layers.brelu - :noindex: - -.. _api_fluid_layers_leaky_relu: - -leaky_relu ----------- - -.. autofunction:: paddle.fluid.layers.leaky_relu - :noindex: - -.. _api_fluid_layers_soft_relu: - -soft_relu ---------- - -.. autofunction:: paddle.fluid.layers.soft_relu - :noindex: - -.. _api_fluid_layers_elu: - -elu ---- - -.. autofunction:: paddle.fluid.layers.elu - :noindex: - -.. _api_fluid_layers_relu6: - -relu6 ------ - -.. autofunction:: paddle.fluid.layers.relu6 - :noindex: - -.. _api_fluid_layers_pow: - -pow ---- - -.. autofunction:: paddle.fluid.layers.pow - :noindex: - -.. _api_fluid_layers_stanh: - -stanh ------ - -.. autofunction:: paddle.fluid.layers.stanh - :noindex: - -.. _api_fluid_layers_hard_sigmoid: - -hard_sigmoid ------------- - -.. autofunction:: paddle.fluid.layers.hard_sigmoid - :noindex: - -.. _api_fluid_layers_swish: - -swish ------ - -.. autofunction:: paddle.fluid.layers.swish - :noindex: - -.. _api_fluid_layers_uniform_random: - -uniform_random --------------- - -.. autofunction:: paddle.fluid.layers.uniform_random - :noindex: - -.. _api_fluid_layers_hard_shrink: - -hard_shrink ------------ - -.. autofunction:: paddle.fluid.layers.hard_shrink - :noindex: - -.. _api_fluid_layers_cumsum: - -cumsum ------- - -.. autofunction:: paddle.fluid.layers.cumsum - :noindex: - -.. _api_fluid_layers_thresholded_relu: - -thresholded_relu ----------------- - -.. autofunction:: paddle.fluid.layers.thresholded_relu - :noindex: - -tensor -====== - -.. _api_fluid_layers_create_tensor: - -create_tensor -------------- - -.. autofunction:: paddle.fluid.layers.create_tensor - :noindex: - -.. _api_fluid_layers_create_parameter: - -create_parameter ----------------- - -.. autofunction:: paddle.fluid.layers.create_parameter - :noindex: - -.. _api_fluid_layers_create_global_var: - -create_global_var ------------------ - -.. autofunction:: paddle.fluid.layers.create_global_var - :noindex: - -.. _api_fluid_layers_cast: - -cast ----- - -.. autofunction:: paddle.fluid.layers.cast - :noindex: - -.. _api_fluid_layers_concat: - -concat ------- - -.. autofunction:: paddle.fluid.layers.concat - :noindex: - -.. _api_fluid_layers_sums: - -sums ----- - -.. autofunction:: paddle.fluid.layers.sums - :noindex: - -.. _api_fluid_layers_assign: - -assign ------- - -.. autofunction:: paddle.fluid.layers.assign - :noindex: - -.. _api_fluid_layers_fill_constant_batch_size_like: - -fill_constant_batch_size_like ------------------------------ - -.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like - :noindex: - -.. _api_fluid_layers_fill_constant: - -fill_constant -------------- - -.. autofunction:: paddle.fluid.layers.fill_constant - :noindex: - -.. _api_fluid_layers_argmin: - -argmin ------- - -.. autofunction:: paddle.fluid.layers.argmin - :noindex: - -.. _api_fluid_layers_argmax: - -argmax ------- - -.. autofunction:: paddle.fluid.layers.argmax - :noindex: - -.. _api_fluid_layers_argsort: - -argsort -------- - -.. autofunction:: paddle.fluid.layers.argsort - :noindex: - -.. _api_fluid_layers_ones: - -ones ----- - -.. autofunction:: paddle.fluid.layers.ones - :noindex: - -.. _api_fluid_layers_zeros: - -zeros ------ - -.. autofunction:: paddle.fluid.layers.zeros - :noindex: - -.. _api_fluid_layers_reverse: - -reverse -------- - -.. autofunction:: paddle.fluid.layers.reverse - :noindex: - -learning_rate_scheduler -======================= - -.. _api_fluid_layers_exponential_decay: - -exponential_decay ------------------ - -.. autofunction:: paddle.fluid.layers.exponential_decay - :noindex: - -.. _api_fluid_layers_natural_exp_decay: - -natural_exp_decay ------------------ - -.. autofunction:: paddle.fluid.layers.natural_exp_decay - :noindex: - -.. _api_fluid_layers_inverse_time_decay: - -inverse_time_decay ------------------- - -.. autofunction:: paddle.fluid.layers.inverse_time_decay - :noindex: - -.. _api_fluid_layers_polynomial_decay: - -polynomial_decay ----------------- - -.. autofunction:: paddle.fluid.layers.polynomial_decay - :noindex: - -.. _api_fluid_layers_piecewise_decay: - -piecewise_decay ---------------- - -.. autofunction:: paddle.fluid.layers.piecewise_decay - :noindex: - -.. _api_fluid_layers_noam_decay: - -noam_decay ----------- - -.. autofunction:: paddle.fluid.layers.noam_decay - :noindex: - -.. _api_fluid_layers_append_LARS: - -append_LARS ------------ - -.. autofunction:: paddle.fluid.layers.append_LARS - :noindex: - -detection -========= - -.. _api_fluid_layers_prior_box: - -prior_box ---------- - -.. autofunction:: paddle.fluid.layers.prior_box - :noindex: - -.. _api_fluid_layers_multi_box_head: - -multi_box_head --------------- - -.. autofunction:: paddle.fluid.layers.multi_box_head - :noindex: - -.. _api_fluid_layers_bipartite_match: - -bipartite_match ---------------- - -.. autofunction:: paddle.fluid.layers.bipartite_match - :noindex: - -.. _api_fluid_layers_target_assign: - -target_assign -------------- - -.. autofunction:: paddle.fluid.layers.target_assign - :noindex: - -.. _api_fluid_layers_detection_output: - -detection_output ----------------- - -.. autofunction:: paddle.fluid.layers.detection_output - :noindex: - -.. _api_fluid_layers_ssd_loss: - -ssd_loss --------- - -.. autofunction:: paddle.fluid.layers.ssd_loss - :noindex: - -.. _api_fluid_layers_detection_map: - -detection_map -------------- - -.. autofunction:: paddle.fluid.layers.detection_map - :noindex: - -.. _api_fluid_layers_iou_similarity: - -iou_similarity --------------- - -.. autofunction:: paddle.fluid.layers.iou_similarity - :noindex: - -.. _api_fluid_layers_box_coder: - -box_coder ---------- - -.. autofunction:: paddle.fluid.layers.box_coder - :noindex: - -metric_op -========= - -.. _api_fluid_layers_accuracy: - -accuracy --------- - -.. autofunction:: paddle.fluid.layers.accuracy - :noindex: - -.. _api_fluid_layers_auc: - -auc ---- - -.. autofunction:: paddle.fluid.layers.auc - :noindex: - -tensor -====== - -.. _api_fluid_layers_create_tensor: - -create_tensor -------------- - -.. autofunction:: paddle.fluid.layers.create_tensor - :noindex: - -.. _api_fluid_layers_create_parameter: - -create_parameter ----------------- - -.. autofunction:: paddle.fluid.layers.create_parameter - :noindex: - -.. _api_fluid_layers_create_global_var: - -create_global_var ------------------ - -.. autofunction:: paddle.fluid.layers.create_global_var - :noindex: - -.. _api_fluid_layers_cast: - -cast ----- - -.. autofunction:: paddle.fluid.layers.cast - :noindex: - -.. _api_fluid_layers_concat: - -concat ------- - -.. autofunction:: paddle.fluid.layers.concat - :noindex: - -.. _api_fluid_layers_sums: - -sums ----- - -.. autofunction:: paddle.fluid.layers.sums - :noindex: - -.. _api_fluid_layers_assign: - -assign ------- - -.. autofunction:: paddle.fluid.layers.assign - :noindex: - -.. _api_fluid_layers_fill_constant_batch_size_like: - -fill_constant_batch_size_like ------------------------------ - -.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like - :noindex: - -.. _api_fluid_layers_fill_constant: - -fill_constant -------------- - -.. autofunction:: paddle.fluid.layers.fill_constant - :noindex: - -.. _api_fluid_layers_argmin: - -argmin ------- - -.. autofunction:: paddle.fluid.layers.argmin - :noindex: - -.. _api_fluid_layers_argmax: - -argmax ------- - -.. autofunction:: paddle.fluid.layers.argmax - :noindex: - -.. _api_fluid_layers_ones: - -ones ----- - -.. autofunction:: paddle.fluid.layers.ones - :noindex: - -.. _api_fluid_layers_zeros: - -zeros ------ - -.. autofunction:: paddle.fluid.layers.zeros - :noindex: - -.. _api_fluid_layers_reverse: - -reverse -------- - -.. autofunction:: paddle.fluid.layers.reverse - :noindex: - -.. _api_fluid_layers_rank_loss: - -rank_loss -------- - -.. autofunction:: paddle.fluid.layers.rank_loss - :noindex: - diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst deleted file mode 100644 index 0f54b2e2eb7ead353215c5dbd529293794e37123..0000000000000000000000000000000000000000 --- a/doc/fluid/api/metrics.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============= -fluid.metrics -============= - -.. _api_fluid_metrics_MetricBase: - -MetricBase ----------- - -.. autoclass:: paddle.fluid.metrics.MetricBase - :members: - :noindex: - -.. _api_fluid_metrics_CompositeMetric: - -CompositeMetric ---------------- - -.. autoclass:: paddle.fluid.metrics.CompositeMetric - :members: - :noindex: - -.. _api_fluid_metrics_Precision: - -Precision ---------- - -.. autoclass:: paddle.fluid.metrics.Precision - :members: - :noindex: - -.. _api_fluid_metrics_Recall: - -Recall ------- - -.. autoclass:: paddle.fluid.metrics.Recall - :members: - :noindex: - -.. _api_fluid_metrics_Accuracy: - -Accuracy --------- - -.. autoclass:: paddle.fluid.metrics.Accuracy - :members: - :noindex: - -.. _api_fluid_metrics_ChunkEvaluator: - -ChunkEvaluator --------------- - -.. autoclass:: paddle.fluid.metrics.ChunkEvaluator - :members: - :noindex: - -.. _api_fluid_metrics_EditDistance: - -EditDistance ------------- - -.. autoclass:: paddle.fluid.metrics.EditDistance - :members: - :noindex: - -.. _api_fluid_metrics_DetectionMAP: - -DetectionMAP ------------- - -.. autoclass:: paddle.fluid.metrics.DetectionMAP - :members: - :noindex: - -.. _api_fluid_metrics_Auc: - -Auc ---- - -.. autoclass:: paddle.fluid.metrics.Auc - :members: - :noindex: - diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst deleted file mode 100644 index 059733af18517257b6821d95fd628a9e13e6e98e..0000000000000000000000000000000000000000 --- a/doc/fluid/api/nets.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -========== -fluid.nets -========== - -.. _api_fluid_nets_simple_img_conv_pool: - -simple_img_conv_pool --------------------- - -.. autofunction:: paddle.fluid.nets.simple_img_conv_pool - :noindex: - -.. _api_fluid_nets_sequence_conv_pool: - -sequence_conv_pool ------------------- - -.. autofunction:: paddle.fluid.nets.sequence_conv_pool - :noindex: - -.. _api_fluid_nets_glu: - -glu ---- - -.. autofunction:: paddle.fluid.nets.glu - :noindex: - -.. _api_fluid_nets_scaled_dot_product_attention: - -scaled_dot_product_attention ----------------------------- - -.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention - :noindex: - diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst deleted file mode 100644 index 8d792120f2f16a8c92606b343eb4c3d4368bed14..0000000000000000000000000000000000000000 --- a/doc/fluid/api/optimizer.rst +++ /dev/null @@ -1,178 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -=============== -fluid.optimizer -=============== - -.. _api_fluid_optimizer_SGD: - -SGD ---- - -.. autoclass:: paddle.fluid.optimizer.SGD - :members: - :noindex: - -.. _api_fluid_optimizer_Momentum: - -Momentum --------- - -.. autoclass:: paddle.fluid.optimizer.Momentum - :members: - :noindex: - -.. _api_fluid_optimizer_Adagrad: - -Adagrad -------- - -.. autoclass:: paddle.fluid.optimizer.Adagrad - :members: - :noindex: - -.. _api_fluid_optimizer_Adam: - -Adam ----- - -.. autoclass:: paddle.fluid.optimizer.Adam - :members: - :noindex: - -.. _api_fluid_optimizer_Adamax: - -Adamax ------- - -.. autoclass:: paddle.fluid.optimizer.Adamax - :members: - :noindex: - -.. _api_fluid_optimizer_DecayedAdagrad: - -DecayedAdagrad --------------- - -.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad - :members: - :noindex: - -.. _api_fluid_optimizer_Ftrl: - -Ftrl ----- - -.. autoclass:: paddle.fluid.optimizer.Ftrl - :members: - :noindex: - -.. _api_fluid_optimizer_SGDOptimizer: - -SGDOptimizer ------------- - -.. autoclass:: paddle.fluid.optimizer.SGDOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_MomentumOptimizer: - -MomentumOptimizer ------------------ - -.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_AdagradOptimizer: - -AdagradOptimizer ----------------- - -.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_AdamOptimizer: - -AdamOptimizer -------------- - -.. autoclass:: paddle.fluid.optimizer.AdamOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_AdamaxOptimizer: - -AdamaxOptimizer ---------------- - -.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_DecayedAdagradOptimizer: - -DecayedAdagradOptimizer ------------------------ - -.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_RMSPropOptimizer: - -RMSPropOptimizer ----------------- - -.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_FtrlOptimizer: - -FtrlOptimizer -------------- - -.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer - :members: - :noindex: - -.. _api_fluid_optimizer_Adadelta: - -Adadelta --------- - -.. autoclass:: paddle.fluid.optimizer.Adadelta - :members: - :noindex: - -.. _api_fluid_optimizer_ModelAverage: - -ModelAverage ------------- - -.. autoclass:: paddle.fluid.optimizer.ModelAverage - :members: - :noindex: - -.. _api_fluid_optimizer_Optimizer: - -Optimizer ---------- - -.. autoclass:: paddle.fluid.optimizer.Optimizer - :members: - :noindex: - -.. _api_fluid_optimizer_RMSPropOptimizer: - -RMSPropOptimizer ----------------- - -.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer - :members: - :noindex: - diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst deleted file mode 100644 index 33035bbc7ca5c8d000adeaf1cb79806a3ea64604..0000000000000000000000000000000000000000 --- a/doc/fluid/api/param_attr.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -================ -fluid.param_attr -================ - -.. _api_fluid_param_attr_ParamAttr: - -ParamAttr ---------- - -.. autoclass:: paddle.fluid.param_attr.ParamAttr - :members: - :noindex: - -.. _api_fluid_param_attr_WeightNormParamAttr: - -WeightNormParamAttr -------------------- - -.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr - :members: - :noindex: - diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst deleted file mode 100644 index c750a2d588df56728ac7f73051ab7a9e44dee232..0000000000000000000000000000000000000000 --- a/doc/fluid/api/profiler.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -============== -fluid.profiler -============== - -.. _api_fluid_profiler_cuda_profiler: - -cuda_profiler -------------- - -.. autofunction:: paddle.fluid.profiler.cuda_profiler - :noindex: - -.. _api_fluid_profiler_reset_profiler: - -reset_profiler --------------- - -.. autofunction:: paddle.fluid.profiler.reset_profiler - :noindex: - -.. _api_fluid_profiler_profiler: - -profiler --------- - -.. autofunction:: paddle.fluid.profiler.profiler - :noindex: - -.. _api_fluid_profiler_start_profiler: - -start_profiler --------------- - -.. autofunction:: paddle.fluid.profiler.start_profiler - :noindex: - -.. _api_fluid_profiler_stop_profiler: - -stop_profiler -------------- - -.. autofunction:: paddle.fluid.profiler.stop_profiler - :noindex: - diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst deleted file mode 100644 index f0c12fd115478a29fbd178b533b7490b2f663717..0000000000000000000000000000000000000000 --- a/doc/fluid/api/recordio_writer.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -===================== -fluid.recordio_writer -===================== - -.. _api_fluid_recordio_writer_convert_reader_to_recordio_file: - -convert_reader_to_recordio_file -------------------------------- - -.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file - :noindex: - -.. _api_fluid_recordio_writer_convert_reader_to_recordio_files: - -convert_reader_to_recordio_files --------------------------------- - -.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files - :noindex: - diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst deleted file mode 100644 index 987eaea903520d91c284c8da7a8cb066a1648069..0000000000000000000000000000000000000000 --- a/doc/fluid/api/regularizer.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -================= -fluid.regularizer -================= - -.. _api_fluid_regularizer_append_regularization_ops: - -append_regularization_ops -------------------------- - -.. autofunction:: paddle.fluid.regularizer.append_regularization_ops - :noindex: - -.. _api_fluid_regularizer_L1Decay: - -L1Decay -------- - -.. autoclass:: paddle.fluid.regularizer.L1Decay - :members: - :noindex: - -.. _api_fluid_regularizer_L2Decay: - -L2Decay -------- - -.. autoclass:: paddle.fluid.regularizer.L2Decay - :members: - :noindex: - -.. _api_fluid_regularizer_L1DecayRegularizer: - -L1DecayRegularizer ------------------- - -.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer - :members: - :noindex: - -.. _api_fluid_regularizer_L2DecayRegularizer: - -L2DecayRegularizer ------------------- - -.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer - :members: - :noindex: - diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst deleted file mode 100644 index d2ac04f1449c32cb414cea1b76d7469bbe9ccb85..0000000000000000000000000000000000000000 --- a/doc/fluid/api/transpiler.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -================ -fluid.transpiler -================ - -.. _api_fluid_transpiler_DistributeTranspiler: - -DistributeTranspiler --------------------- - -.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler - :members: - :noindex: - -.. _api_fluid_transpiler_InferenceTranspiler: - -InferenceTranspiler -------------------- - -.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler - :members: - :noindex: - -.. _api_fluid_transpiler_memory_optimize: - -memory_optimize ---------------- - -.. autofunction:: paddle.fluid.transpiler.memory_optimize - :noindex: - -.. _api_fluid_transpiler_release_memory: - -release_memory --------------- - -.. autofunction:: paddle.fluid.transpiler.release_memory - :noindex: - -.. _api_fluid_transpiler_HashName: - -HashName --------- - -.. autoclass:: paddle.fluid.transpiler.HashName - :members: - :noindex: - -.. _api_fluid_transpiler_RoundRobin: - -RoundRobin ----------- - -.. autoclass:: paddle.fluid.transpiler.RoundRobin - :members: - :noindex: - diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst deleted file mode 120000 index ae4e8c7c48e584ec16a7be5466f83dd154ffb5fb..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/build_from_source_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/build_from_source_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst deleted file mode 120000 index 1ac828c973826bb8374c4aa8e17fda3ea1bb939f..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/build_from_source_en.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/build_from_source_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst deleted file mode 120000 index 965b2e20559291989422938c418fadbac16941b9..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/docker_install_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/docker_install_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst deleted file mode 120000 index 79d7341a7bbb9e477c773134f24983fd7607769a..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/docker_install_en.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/docker_install_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst deleted file mode 120000 index f697fcd8fac9131862ae7f8f51c5ebe93737ad2d..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/index_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/index_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst deleted file mode 120000 index 502f66a41319d4f41ae1774628ca36da9dca76ce..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/index_en.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/index_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png deleted file mode 120000 index c3eb1457acc77cab9360e654240d1e8f548035b4..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/paddleci.png +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/paddleci.png \ No newline at end of file diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst deleted file mode 120000 index 07deca84b82ff553e0c19324695089dcfb6be90e..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/pip_install_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/pip_install_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst deleted file mode 120000 index 7f39c998195b719b05443e96f1c4a6a8d44b98c9..0000000000000000000000000000000000000000 --- a/doc/fluid/build_and_install/pip_install_en.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/build_and_install/pip_install_en.rst \ No newline at end of file diff --git a/doc/fluid/design/algorithm/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif deleted file mode 100644 index 4a0da7bf6df9326a2aab1638b77c5455c18b8c4e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/algorithm/images/asgd.gif and /dev/null differ diff --git a/doc/fluid/design/algorithm/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif deleted file mode 100644 index dd24d33e124396be3fc410c9b12f33148f64efe2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/algorithm/images/theta_star.gif and /dev/null differ diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst deleted file mode 100644 index 0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9..0000000000000000000000000000000000000000 --- a/doc/fluid/design/algorithm/index_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -梯度更新算法 ------------- - -.. toctree:: - :maxdepth: 1 - - parameter_average.md diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst deleted file mode 100644 index 59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc..0000000000000000000000000000000000000000 --- a/doc/fluid/design/algorithm/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -Gradient Update Algorithm --------------------------------------- - -.. toctree:: - :maxdepth: 1 - - parameter_average.md diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md deleted file mode 100644 index 28ad6495d97515442eb8af2050158829814acd33..0000000000000000000000000000000000000000 --- a/doc/fluid/design/algorithm/parameter_average.md +++ /dev/null @@ -1,74 +0,0 @@ -# Averaging Parameter in PaddlePaddle - -## Why Averaging -In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible. - -Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. - -Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
. The averaging is done as follows: - -

-
-

- -We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. - -### How to perform Parameter Averaging in PaddlePaddle - -Parameter Averaging in PaddlePaddle works in the following way during training : -1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer -2. The optimizer itself is responsible for updating the parameters. -3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: - 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches. - 2. However, saving all N instances of the parameters in memory is not feasible. - 3. Therefore, an approximation algorithm is used. - -Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. - -During the testing/saving the model phase, we perform the following steps: -1. Perform the delayed operations. -2. Save current values of the parameters to a temporary variable. -3. Replace the values of the parameters with the averaged values. -4. Perform testing and/or save the parameters. -5. Restore the values of the parameters once done. - -### How to implement Averaging of Parameter in PaddlePaddle - -We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. - - **Advantages**: - - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. - - Makes it easy for the users to customize and extend the framework. - - **Disadvantages**: - - Implementation requires re-writing the averaging methodology in Python. - -### Low-Level implementation - -In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: -- the optimizer -- the window_size to keep the updates - -The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. - -The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. - -### Python API implementation for ParameterAverageOptimizer - -Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: -- Any optimizer (RMSProp , AdaGrad etc.) -- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision. - -Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. -We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc) - -#### Creation of the ParameterAverageOptimizer operator -There are two ways for creating the ParameterAverageOptimizer op: -1. We create the op immediately while building the computation graph. -2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. - -The proposal is to add the op immediately while building the computation graph. - -#### High-level API - -In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md deleted file mode 100644 index 8ded0ad22f4013a521bf3bee260565dc5cf855ae..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/README.md +++ /dev/null @@ -1,174 +0,0 @@ -A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++. - -Here are some initial thoughts. Your comments are welcome! - -# Required CMake Function - -I think we need only the following few CMake functions to make a project description mean and clean: - - - - - - - - - - - - - - - - - - - - - - - - - - -
C++CUDA C++Go
cc_library nv_library go_library
cc_binary nv_binary go_binary
cc_test nv_test go_test
- - -- The `_library` functions generate .a files from source code. -- The `_binary` functions generate executable binary files. -- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`. - -The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler. - -Both `nv_` and `cc_` functions enables C++11 (-std=c++11). - -Also, - -- to describe external dependencies, we need `external_library`. -- to build shared libraries, we need `shared_library`. - -## An Example Project - -Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: - -- tensor.h -- tensor.cc -- tensor_test.cc -- ops.h -- ops.cu -- ops_test.cu -- api.go -- api_test.go - -Suppose that ops.cu depends on CUDNN. - -```cmake -# cc_binary parses tensor.cc and figures out that target also depend -# on tensor.h. -cc_binary(tensor - SRCS - tensor.cc) - -# The dependency to target tensor implies that if any of -# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. -cc_test(tensor_test - SRCS - tensor_test.cc - DEPS - tensor) - -# I don't have a clear idea what parameters external_library need to -# have. @gangliao as a CMake expert would have better ideas. -external_library(cudnn - ....) - -# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu -# include global functions that take Tensor as their parameters, so -# ops depend on tensor. This implies that if any of tensor.{h.cc}, -# ops.{h,cu} is changed, ops need to be re-built. -nv_library(ops - SRCS - ops.cu - DEPS - tensor - cudnn) # cudnn is defined later. - -nv_test(ops_test - SRCS - ops_test.cu - DEPS - ops) - -# Because api.go defines a GO wrapper to ops and tensor, it depends on -# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or -# api.go is changed, api need to be re-built. -go_library(api - SRCS - api.go - DEPS - tensor # Because ops depend on tensor, this line is optional. - ops) - -go_test(api_test - SRCS - api_test.go - DEPS - api) - - -# This builds libapi.so. shared_library might use CMake target -# api_shared so to distinguish it from above target api. -shared_library(api - DEPS - api) - -``` - -## Implementation - -As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. - -## Using Package Manager For Go - -Building Go binaries and libraries need to satisfy their dependencies, generally -we can do `go get ./...` to download and compile all external dependencies. The -problems are: - -1. `go get` will always get the latest code from the default branch of the - remote repo, so changes of dependents might break the build. This is very - different with what we already have in `cmake/external` which download a - specific version or commit id of the dependency. -1. Some locations can not access external dependencies through the internet, as mentioned - in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management - tools can package the dependencies as a "vendor" package, which can be mirrored - at many cloud file hosting, so users what to compile paddle by themselves can - download this "vendor" package from a mirror site. - -### Choose A Suitable Tool - -As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) -list dozens of Go package managers. We choose the tool using following principles: - -- Most "active" projects with more stars, more pull requests or commits -- Widely used project - -After comparing all these projects, we shall choose between the most popular -tools: Godep and Glide. - -Here's a brief comparison between Godep and Glide -: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are -also many complaints about using `Godep`. There's also a new "official" pakcage -management tool has been started at: https://github.com/golang/dep to resolve -such problems, but it's currently at Alpha stage. So the best choice now is -glide obviously. - -### Manage Go Packages - -- Dependencies: `go/glide.yaml` will store the dependencies and their versions which - is directly imported by paddle. `go/glide.lock` will store all dependencies recursively - with their commit id. Builds will "lock" to these packages if we don't `glide up` - them -- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake` - will download the code corresponding to `go/glide.lock`. If we put a vendor folder - under `go/`, cmake will just check the commit id to the packages under the folder, - if commit id matches, there will be no download at all. diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md deleted file mode 100644 index 3757cd055c818be1e63ee8c0f000f4dd299b59f4..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/block.md +++ /dev/null @@ -1,375 +0,0 @@ -# Design Doc: Block and Scope - -## The Representation of Computation - -Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: - -- Caffe, Torch, and Paddle: sequences of layers. -- TensorFlow, Caffe2, Mxnet: graph of operators. -- PaddlePaddle: nested blocks, like C++ and Java programs. - -## Block in Programming Languages and Deep Learning - -In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. - -Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: - - - - - - - - - - - - - - - - - - - - - - -
programming languagesPaddlePaddle
for, while loop RNN, WhileOp
if, if-else, switch IfElseOp, SwitchOp
sequential execution a sequence of layers
- - -A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes. - -## Stack Frames and the Scope Hierarchy - -The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: - - - - - - - - - - - - - - - - - - - - - - - - - - -
programming languagesPaddlePaddle
stack scope hierarchy
stack frame scope
push at entering block push at entering block
pop at leaving block destroy when minibatch completes
- - -1. In traditional programs: - - - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables. - - After the execution leaves the right curly brace, the runtime pops the frame. - - The maximum number of frames in the stack is the maximum depth of nested blocks. - -1. In PaddlePaddle - - - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. - - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. - - The height of the highest tree is the maximum depth of nested blocks. - - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. - -## Use Blocks in C++ and PaddlePaddle Programs - -Let us consolidate the discussion by presenting some examples. - -### Blocks with `if-else` and `IfElseOp` - -The following C++ programs shows how blocks are used with the `if-else` structure: - -```c++ -namespace pd = paddle; - -int x = 10; -int y = 1; -int z = 10; -bool cond = false; -int o1, o2; -if (cond) { - int z = x + y; - o1 = z; - o2 = pd::layer::softmax(z); -} else { - int d = pd::layer::fc(z); - o1 = d; - o2 = d+1; -} - -``` - -An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows: - -```python -import paddle as pd - -x = minibatch([10, 20, 30]) # shape=[None, 1] -y = var(1) # shape=[1], value=1 -z = minibatch([10, 20, 30]) # shape=[None, 1] -cond = larger_than(x, 15) # [false, true, true] - -ie = pd.ifelse() -with ie.true_block(): - d = pd.layer.add_scalar(x, y) - ie.output(d, pd.layer.softmax(d)) -with ie.false_block(): - d = pd.layer.fc(z) - ie.output(d, d+1) -o1, o2 = ie(cond) -``` - -In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . - -The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. - - -### Blocks with `for` and `RNNOp` - -The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) : - -```python -x = sequence([10, 20, 30]) # shape=[None, 1] -m = var(0) # shape=[1] -W = var(0.314, param=true) # shape=[1] -U = var(0.375, param=true) # shape=[1] - -rnn = pd.rnn() -with rnn.step(): - h = rnn.memory(init = m) - h_prev = rnn.previous_memory(h) - a = layer.fc(W, x) - b = layer.fc(U, h_prev) - s = pd.add(a, b) - act = pd.sigmoid(s) - rnn.update_memory(h, act) - rnn.output(a, b) -o1, o2 = rnn() -``` -has its equivalent C++ program as follows - -```c++ -int* x = {10, 20, 30}; -int* m = {0}; -int* W = {0.314}; -int* U = {0.375}; - -int mem[sizeof(x) / sizeof(x[0]) + 1]; -int o1[sizeof(x) / sizeof(x[0]) + 1]; -int o2[sizeof(x) / sizeof(x[0]) + 1]; -for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { - int x = x[i-1]; - if (i == 1) mem[0] = m; - int a = W * x; - int b = Y * mem[i-1]; - int s = fc_out + hidden_out; - int act = sigmoid(sum); - mem[i] = act; - o1[i] = act; - o2[i] = hidden_out; -} -``` - -## Compilation and Execution - -Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. - -The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. - -## The "Binary Executable File Format" - -The definition of the protobuf message is as follows: - -```protobuf -message BlockDesc { - repeated VarDesc vars = 1; - repeated OpDesc ops = 2; -} -``` - -The step net in above RNN example would look like - -``` -BlockDesc { - vars = { - VarDesc {...} // x - VarDesc {...} // h - VarDesc {...} // fc_out - VarDesc {...} // hidden_out - VarDesc {...} // sum - VarDesc {...} // act - } - ops = { - OpDesc {...} // matmul - OpDesc {...} // add_two - OpDesc {...} // sigmoid - } -}; -``` - -Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like: - -``` -OpDesc { - inputs = {0} // the index of x in vars of BlockDesc above - outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above - attrs { - "states" : {1} // the index of h - "step_net" : - } -}; -``` - -This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block. - - -## The Compilation of Blocks - -During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). - -VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope. -Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example: - -```python -a = pd.Variable(shape=[20, 20]) -b = pd.fc(a, params=["fc.w", "fc.b"]) - -rnn = pd.create_rnn() -with rnn.stepnet(): - x = a.as_step_input() - # reuse fc's parameter - fc_without_b = pd.get_variable("fc.w") - rnn.output(fc_without_b) - -out = rnn() -``` -The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. - -In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. - -To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. - -`SymbolTable` can do the following: - -- store the definitions (some names and attributes) of variables and operators, -- verify if a variable was declared, -- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). - - -```c++ -// Information in SymbolTable is enough to trace the dependency graph. So maybe -// the Eval() interface takes a SymbolTable is enough. -class SymbolTable { - public: - SymbolTable(SymbolTable* parent) : parent_(parent) {} - - OpDesc* NewOp(const string& name=""); - - // TODO determine whether name is generated by python or C++. - // Currently assume that a unique name will be generated by C++ if the - // argument name is left default. - VarDesc* Var(const string& name=""); - - // find a VarDesc by name, if recursive is true, find parent's SymbolTable - // recursively. - // this interface is introduced to support InferShape, find protobuf messages - // of variables and operators, pass pointers into InferShape. - // - // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should - // be proposed and embedded into pybind to enable python operation on C++ pointers. - VarDesc* FindVar(const string& name, bool recursive=true); - - OpDesc* FindOp(const string& name); - - BlockDesc Compile() const; - - private: - SymbolTable* parent_; - - map ops_; - map vars_; -}; -``` - -After all the description of variables and operators is added into SymbolTable, -the block has enough information to run. - -The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. - - -```c++ -namespace { - -class Block : OperatorBase { -public: - Block(const BlockDesc& desc) desc_(desc) {} - - void InferShape(const framework::Scope& scope) const override { - if (!symbols_ready_) { - CreateVariables(scope); - CreateOperators(); - } - // should run InferShape first. - for (auto& op : runtime_table_.ops()) { - op->InferShape(scope); - } - } - - void Run(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); - for (auto& op : runtime_table_.ops()) { - op->Run(scope, place); - } - } - - void CreateVariables(const framework::Scope& scope); - void CreateOperators(); - - // some other necessary interfaces of NetOp are listed below - // ... - -private: - BlockDesc desc_; - bool symbols_ready_{false}; -}; -``` - -## The Execution of Blocks - -Block inherits from OperatorBase, which has a Run method. -Block's Run method will run its operators sequentially. - -There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. - -The definition of Eval is as follows: - -```c++ -// clean a block description by targets using the corresponding dependency graph. -// return a new BlockDesc with minimal number of operators. -// NOTE: The return type is not a Block but the block's description so that this can be distributed -// to a cluster. -BlockDesc Prune(const BlockDesc& desc, vector targets); - -void Block::Eval(const vector& targets, - const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) { - BlockDesc min_desc = Prune(desc_, targets); - Block min_block(min_desc); - min_block.Run(scope, dev_ctx); -} -``` diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md deleted file mode 100644 index aabc1ba75a67c5767d409bd6e7e6240dec86b16c..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/cpp_data_feeding.md +++ /dev/null @@ -1,204 +0,0 @@ -# C++ Data Feeding - -While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. - -In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching. - -## Overview - -![](images/readers.png) - -## Reader - -In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data. - - -### ReaderBase - -`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers. - -```cpp -class ReaderBase { - public: - // Reads the next batch of data. (A 'batch' can be only one instance) - // If the next batch doesn't exist, it throws an exception - virtual void ReadNext(std::vector* out) = 0; - - // Checks whether the next instance exists. - virtual bool HasNext() = 0; - - // Reinitializes the reader and read the file from the beginning. - virtual void ReInit() = 0; - - virtual ~ReaderBase(); -}; -``` - -### FileReader - -`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format. - -```cpp -class FileReader : public ReaderBase { - public: - explicit FileReader(const std::vector& dims); - - void ReadNext(std::vector* out) override; - - protected: - virtual void ReadNextImpl(std::vector* out) = 0; - - private: - std::vector dims_; -}; -``` - -A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`. - -The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`. - -### DecoratedReader - -A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling, batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers. - -```cpp -class DecoratedReader : public ReaderBase { - public: - explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { - PADDLE_ENFORCE_NOT_NULL(reader_); - } - - void ReInit() override { reader_->ReInit(); } - - bool HasNext() const override { return reader_->HasNext(); } - - protected: - ReaderBase* reader_; -}; -``` - -Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated for multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops use readers without knowing their underlying type. - -### MultipleReader - -All `FileReader` binds with a single file and are single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`. - -So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yield by all prefetching readers and makes subsequent OPs or decorated readers be able to fetch data without concerning about multiple readers scheduling. - -![](images/multiple_reader.png) - -This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time when a prefetching file reader is free(complete reading from one file), it fetches a new file from the queue. Each prefetching file reader runs in a separated prefetch thread and dumps their outputs to the same channel. - -To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel. - -### ReaderHolder - -Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code: - -```cpp -var->Get("batch_reader"); -``` - -We would have to write: - -```cpp -var->Get("batch_reader"); -``` - -This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible. - -To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get("...")` and regard the obtained object as a reader. - -## Related Operators - -To create and invoke readers, some new ops are introduced: - -### Operators That Create Readers - -Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. - -However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice. - -### OpenFilesOp - -The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names. - -To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors. - -### HasNextOp - -`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface. - -### ResetOp - -`ResetOp` is used to reset a reader via its `ReInit()` interface. - -### ReadOp - -A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables. - -## Program with Readers - -A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`. - -The ops of a `startup_program` with readers would be like this: - -``` -multiple_reader = open_files_op(...) -batch_reader = create_batch_reader_op(multiple_reader) -double_buffer_reader = create_double_buffer_op(batch_reader) -... (other initializers) -``` - -The forwarding ops of the corresponding `main_program` would be like this: - -``` -not_completed = true -pass_count = 0 -while_op(not_completed) { - has_next = has_next_op(double_buffer_reader) - if_else_op(has_next) { - batch_data = read_op(double_buffer_reader) - ... (subsequent training ops) - } else { - reset_op(double_buffer_reader) - increase_op(pass_count) - not_completed = less_than_op(pass_count, reqiured_pass_num) - } -} -``` - -A few important considerations for these programs are as follows: - -1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables. - -2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. - -3. All readers exist in both `startup_program` and `main_program`. And they are persistable. - -### Simplify Configuration by MultiPassReader - -The Program configuration mentioned above is complicated. Users need to be very familiar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`. - -`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`. - -With `MultiPassReader`, the startup program would be like this: - -``` -multiple_reader = open_files_op(...) -batch_reader = create_batch_reader_op(multiple_reader) -multi_pass_reader = create_multi_pass_reader_op(batch_reader) -double_buffer_reader = create_double_buffer_op(multi_pass_reader) -... (other initializers) -``` - -The forwarding part of the corresponding `main_program` would be like this: - -``` -not_completed = true -while_op(not_completed) { - batch_data = read_op(double_buffer_reader) - ... (subsequent training ops) - not_completed = has_next_op(double_buffer_reader) -} -``` diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md deleted file mode 100644 index 3fcddf4dd90f826ee1a16713f4371fb010f8eac5..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/executor.md +++ /dev/null @@ -1,29 +0,0 @@ -# Executor Design Doc - -## Motivation -In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message -[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). - -The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code. - -## Overview - -An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs. - -## Executor - -The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one. -It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process. - -### The interface -```c++ - Executor(places); -``` -A executor does not own any computing resources, a user can only construct an executor using the specified places. - -### Running an Executor - -``` - void Run(ProgramDesc, Scope, block_id, create_local_scope); -``` -An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished. diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md deleted file mode 100644 index 1f86b99e5197c3e0b85fd76fe704520ef21b06d3..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/functions_operators_layers.md +++ /dev/null @@ -1,128 +0,0 @@ -# Design Doc: Functions, Operators, and Layers - -In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator. - -Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation. - -In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions: - -```c++ -template T add(T x, T y) { return x + y; } -template T mul(T x, T y) { return x * y; } -``` - -Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation - -```c++ -#define MAKE_FUNCTION_OPERATOR(mul); -``` - -generates - -```c++ -template class mulOp : public OperatorBase {...}; -REGISTER_OP(mulOp, "mul"); -``` - -so that in Python we can create operator mul by: - -```python -X1 = Var() -X2 = Var() -Y = Var() -paddle.cpp.create_operator("mul", input=[X1, X2], output=Y) -``` - -Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`: - -```c++ -template -class FCOp : public OperatorBase { - public: - void Run(...) { - add(mul(Input("X"), Input("W")), Input("b")); - } -}; -REGISTER_OP(FCOp, "fc"); -``` - -We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API. - -Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`: - -```python -def operator.mul(X1, X2): - O = Var() - paddle.cpp.create_operator("mul", input={X1, Y1}, output=O) - return O - -def operator.add(X1, X2): - O = Var() - paddle.cpp.create_operator("add", input={X1, X2}, output=O) - return O -``` - -Above code snippets are automatically generated. Given them, users can define - -```python -def layer.fc(X): - W = Var() - b = Var() - return operator.add(operator.mul(X, W), b) -``` - -If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated: - -```python -def layer.fc(X): - W = Var() - b = Var() - O1 = Var() - paddle.cpp.create_operator("mul", input=[X, W], output=O1) - O2 = Var() - paddle.cpp.create_operator("add", input=[O1, b], output=O2) - return O2 -``` - -We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
C++ functions/functorsmuladd
C++ operator class mulOpaddOp FCOp
Python binding operator.mul operator.add operator.fc
Python function layer.fc
- - -This is how we differentiate layer and operators in PaddlePaddle: - -- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas -- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers. diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png deleted file mode 100644 index b22126b31db4982c13fc3a0827805e6aaf955046..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concepts/images/multiple_reader.png and /dev/null differ diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot deleted file mode 100644 index 40753cb140540c08d9d4c449b8d377e315280436..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/images/parallel_executor_overview.dot +++ /dev/null @@ -1,83 +0,0 @@ -digraph G { - subgraph cluster_init { - label="Initialization" - startup_program [label="startup", shape=box] - node_w_g0 [label="W\nGPU0"] - startup_program -> node_w_g0 [label="Initialize"] - node_w_g1 [label="W\nGPU1"] - node_w_g0 -> node_w_g1 [label="broadcast"] - } - - subgraph cluster_train { - label="forward_backward" - - subgraph cluster_gpu0 { - label="GPU0" - fc_0 [label="fc\nGPU0", shape=box] - hidden_0 [label="hidden\nGPU0"] - node_w_g0 -> fc_0 - fc_0 -> hidden_0 - loss0 [label="loss\nGPU0"] - hidden_0 -> loss0 [label="many ops omitted"] - scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box] - loss_g0 [label="loss_grad\nGPU0"] - scale_loss_0->loss_g0 - - fc_g_0 [label="w_grad\nGPU0", shape=box] - loss0 -> fc_g_0 - loss_g0 -> fc_g_0 - hidden_0 -> fc_g_0 - } - - subgraph cluster_gpu1 { - label="GPU1" - fc_1 [label="fc\nGPU1", shape=box] - hidden_1 [label="hidden\nGPU1"] - node_w_g1 -> fc_1 - fc_1 -> hidden_1 - loss1 [label="loss\nGPU1"] - hidden_1 -> loss1 [label="many ops omitted"] - scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box] - loss_g1 [label="loss_grad\nGPU1"] - scale_loss_1->loss_g1 - - fc_g_1 [label="w_grad\nGPU1", shape=box] - loss1 -> fc_g_1 - loss_g1 -> fc_g_1 - hidden_1 -> fc_g_1 - } - } - - all_reduce_w [label="Merge Gradients(AllReduce)", shape=box] - fc_g_0 -> all_reduce_w - fc_g_1 -> all_reduce_w - - fc_g_0_merged [label="w_grad\nMerged\nGPU0"] - fc_g_1_merged [label="w_grad\nMerged\nGPU1"] - all_reduce_w -> fc_g_0_merged - all_reduce_w -> fc_g_1_merged - - subgraph cluster_optimization { - label="Optimization" - subgraph cluster_opt_gpu0 { - label="GPU0" - sgd_0 [label="SGD Op\nGPU0", shape=box] - - fc_g_0_merged -> sgd_0 - node_w_g0 -> sgd_0 - optimized_w_0 [label="Optimized W\nGPU0"] - sgd_0 -> optimized_w_0 - } - subgraph cluster_opt_gpu1 { - label="GPU1" - sgd_1 [label="SGD Op\nGPU1", shape=box] - - fc_g_1_merged -> sgd_1 - node_w_g1 -> sgd_1 - optimized_w_1 [label="Optimized W\nGPU0"] - sgd_1 -> optimized_w_1 - } - } - - -} diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png deleted file mode 100644 index d890c0ffee3b38dc7cb74a2b56c2ab4831532211..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concepts/images/parallel_executor_overview.png and /dev/null differ diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png deleted file mode 100644 index fd59168ce16c9e2a0ef45303c28c997cfd7740be..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concepts/images/readers.png and /dev/null differ diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst deleted file mode 100644 index dcdc894937ff328e6002623275ca3c65e87b2bb0..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/index_cn.rst +++ /dev/null @@ -1,19 +0,0 @@ -核心概念 -------------- - -.. toctree:: - :maxdepth: 1 - - README.md - cpp_data_feeding.md - functions_operators_layers.md - program.md - variable.md - var_desc.md - tensor.md - tensor_array.md - lod_tensor.md - block.md - scope.md - executor.md - parallel_executor.md diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst deleted file mode 100644 index b85a3055746facaa642e8fc899976b58435f1ef2..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/index_en.rst +++ /dev/null @@ -1,19 +0,0 @@ -Core Concepts --------------------------------------- - -.. toctree:: - :maxdepth: 1 - - README.md - cpp_data_feeding.md - functions_operators_layers.md - program.md - variable.md - var_desc.md - tensor.md - tensor_array.md - lod_tensor.md - block.md - scope.md - executor.md - parallel_executor.md diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md deleted file mode 100644 index 748488f6d5f2f1272e87b89047570632418da8dc..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/lod_tensor.md +++ /dev/null @@ -1,211 +0,0 @@ -# Design Doc: LoD (Level-of-Detail) Tensor - -Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TensorFlowPaddlePaddle
RNN Support Support
recursive RNN Support Support
padding zeros Must No need
blob data type Tensor LoDTensor
- - -PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor. - - -## The Challenge: Variable-length Sequences - -Most deep learning systems represent a mini-batch as a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector. Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor. - -Both examples show that the elements of sequences are usually of the same size. In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors. It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors. - -The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences. Also, sequences might consist of sub-sequences. - - -## A Solution: The LoD Index - -To understand our solution, it is best to look at some examples. - -### A Mini-Batch of Sentences - -Let's imagine a mini-batch of 3 variable lengths sentences composed of 3, 1, and 2 words, respectively. We can represent the mini-batch by a (3+1+2)xD tensor plus some index information: - -``` -3 1 2 -||| | || -``` - -where each `|` represents a D-dimensional word vector. The numbers, 3, 1, and 2, form a 1-level LoD. - -### Recursive Sequences - -Let check another example of a 2-level LoD Tensor. Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words: - -``` -3 1 2 -3 2 4 1 2 3 -||| || |||| | || ||| -``` - -### A Mini-Batch of Videos - -LoD tensors generalize to the case where elements are higher dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively. - -``` -3 1 2 -口口口 口 口口 -``` - -The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image. - -### A Mini-Batch of Images - -In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as - -``` -1 1 1 1 1 -口口口口 ... 口 -``` - -In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor: - -``` -口口口口 ... 口 -``` - -### Model Parameters - -A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**. - - -## The LoD Tensor - -Let us revisit above example of the 2-level LoD Tensor - -``` -3 1 2 -3 2 4 1 2 3 -||| || |||| | || ||| -``` - -It is indeed a tree, where leaves are elementary sequences identified by **branches**. - -For example, the third sentence in above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4. - -### The LoD Index - -We can save the LoD index in the above example - -``` -3 1 2 -3 2 4 1 2 3 -``` - -in a not-full 2D matrix: - -```c++ -typedef std::vector > LoD; -``` - -where - -- `LoD.size()` is the number of levels, or the maximum length of branches, -- `LoD[i][j]` is the length of the j-th segment at the i-th level. - -## The Offset Representation - -To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences. - -In the above example, we accumulate the length of elementary sequences: - -``` -3 2 4 1 2 3 -``` - -into offsets - -``` -0 3 5 9 10 12 15 - = = = = = = - 3 2+3 4+5 1+9 2+10 3+12 -``` - -so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5. - -Similarly, the lengths in the top level LoD - -``` -3 1 2 -``` - -are transformed into offsets of elements/words as follows: - -``` -0 3 4 6 - = = = - 3 3+1 4+2 -``` - -## Slicing of LoD Tensors - - -When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**. - -For example, the <2>-slice of above example is - -``` -10 15 -10 12 15 - || ||| -``` - -and the <2,0>-slice of above slice is - -``` -10 12 - || -``` - -## Length Representation vs Offset Representation - -The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult. -Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. -Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python: -```Python -# length representation of lod called recursive_sequence_lengths -recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]] -# Create a LoDTensor that has the above recursive_sequence_lengths info. -# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood. -tensor = fluid.LoDTensor(lod) - -# Set/Change the recursive_sequence_lengths info of LoDTensor -tensor.set_recursive_sequence_lengths([[3, 1, 2]]) -# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted -# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]] -new_recursive_seq_lens = tensor.recursive_sequence_lengths() -``` diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md deleted file mode 100644 index 4f88e27bed722e9f2f535e368926fe49b4e72e56..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/parallel_executor.md +++ /dev/null @@ -1,104 +0,0 @@ -# ParallelExecutor - -## Background - -Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side. - -The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism, however, lacking device information in `ProgramDesc`; it is not possible to optimize the performance of `Parallel.Do`. - -We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs. - -ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs. - - -## Overview of MultiGPUs logic - -The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by startup program and will broadcast to multi-GPUs. The main program will be duplicated into multi-GPUs. The gradient will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device will be merged before parameter optimization, the parameters will be the same on each device and it does not need to be broadcast the parameters. - -![alt](images/parallel_executor_overview.png) - -There are several optimizations for this logic. - -1. We use an alternate representation in ParallelExecutor. It because the device information is critical for performance optimization. -2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready. - * GPU is a high-performance device; only one CPU thread cannot fulfil one GPU. So there is a thread pool to execute operators. - * Out-of-order also helps transpilers to generate `ProgramDesc`. It is no need to concern about the best order of performance when implementing a transpiler. -3. The streams of computation, merge gradients and fetch data are different. - -The performance of `ResNeXt152` on `TitanX` which `batch_size=12` is shown below. - -| Number of GPUs | 1 | 2 | 3 | 4| -| --- | --- | --- | --- | --- | -| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 | -| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 | - - -## Static single assignment Graph - -[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we uses an `SSA` graph as an intermedia representation of `ProgramDesc`. - -The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce a variable will be assigned once, by adding version number to varaibles. We parsing the `Program` into a `SSA` graph. Also, ProgramExecutor duplicate `Program` into multi-devices. We also add a device number to varaibles and insert `NCCLAllReduce` into Graph. - -The data structure of `SSA` graph is: - -```c++ -struct VarHandleBase { - OpHandleBase* generated_op_; - vector pending_ops_; - - string name; - Place place; - size_t version; -}; - -struct OpHandleBase { - vector inputs_; - vector outputs_; -}; - -struct SSAGraph { - // vars on each devices. - // * the vars in each map in vector is on different device. - // * the map is mapping a variable name to variable handles - // with different versions - vector>> vars_; - - // All ops - vector ops_; -}; -``` -The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts. - -When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem. - -## Execute SSA Graph - -The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is - -1. Maintaining a map of an operator and its needed input number. -2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators. -3. If there is an operator which needed input number is decreased to zero, just run this operator. -4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated. - -Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph. - -## Synchronize GPU Kernels - -The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm: - -1. `OpHandle` will record `DeviceContext` that it is used. -2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable. - -The `wait` are implemented by two strategies: - -1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete. -2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU. - -Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime. - -## What's next? - -* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done. -* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too. -* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision. -* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator. diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md deleted file mode 100644 index cfcd21ecdb9d2844bf93ed98a56db09651077c40..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/program.md +++ /dev/null @@ -1,139 +0,0 @@ -# Design Doc: PaddlePaddle Programs - -## Compile and Execution - -A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. - -A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md): - -```python -x = layer.data("images") -l = layer.data("label") -y = layer.fc(x) -cost = layer.mse(y, l) -optimize(cost) -train(cost, reader=mnist.train()) -``` - -The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. - -## Programs and Blocks - -The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. - -- program: some nested blocks -- [block](./block.md): - - some local variable definitions, and - - a sequence of operators - -The concept of block comes from usual programs. For example, the following C++ program has three blocks: - -```c++ -int main() { // block 0 - int i = 0; - if (i < 10) { // block 1 - for (int j = 0; j < 10; j++) { // block 2 - } - } - return 0; -} -``` - -The following PaddlePaddle program has three blocks: - -```python -import paddle as pd // block 0 - -x = minibatch([10, 20, 30]) # shape=[None, 1] -y = var(1) # shape=[1], value=1 -z = minibatch([10, 20, 30]) # shape=[None, 1] -cond = larger_than(x, 15) # [false, true, true] - -ie = pd.ifelse() -with ie.true_block(): // block 1 - d = pd.layer.add_scalar(x, y) - ie.output(d, pd.layer.softmax(d)) -with ie.false_block(): // block 2 - d = pd.layer.fc(z) - ie.output(d, d+1) -o1, o2 = ie(cond) -``` - -## `BlockDesc` and `ProgramDesc` - -All protobuf messages are defined in `framework.proto`. - -`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. - -```protobuf -message BlockDesc { - required int32 parent = 1; - repeated VarDesc vars = 2; - repeated OpDesc ops = 3; -} -``` - -The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. - -All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. - -```protobuf -message ProgramDesc { - repeated BlockDesc blocks = 1; -} -``` - - -### Global Block - -The global block is the first one in the above array. - -## Operators that Use Blocks - -In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. - -The definition of `OpDesc` shows that an operator could have some attributes: - -```protobuf -message OpDesc { - AttrDesc attrs = 1; - ... -} -``` - -and an attribute could be of type block, which is, in fact, a block ID as described above: - -``` -message AttrDesc { - required string name = 1; - - enum AttrType { - INT = 1, - STRING = 2, - ... - BLOCK = ... - } - required AttrType type = 2; - - optional int32 block = 10; // when type == BLOCK - ... -} -``` - -## InferShape - -With this design, the InferShape function should take the following parameters: - -```c++ -void InferShape(int current_block, - int current_operator, - ProgramDesc* program // might change VarDesc values. - ) { - ... -} -``` - -where - -- `current_block` indices into `ProgramDesc::blocks`, -- `current_operator` indices into `BlockDesc::ops`. diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md deleted file mode 100644 index dffee8e02bacbc99bdfa8c54f1a146de340ad778..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ /dev/null @@ -1,130 +0,0 @@ -# Python Data Feeding - -In the former implementation of Paddle Fluid, there are two ways to feed data: - -- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. - -- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. - -In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue. - - -## Design of LoDTensorBlockingQueue -`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`. - -```C++ -class LoDTensorBlockingQueueHolder; - -class LoDTensorBlockingQueue { - friend class LoDTensorBlockingQueueHolder; - private: - // `LoDTensorBlockingQueue` can only be constructed by - // `LoDTensorBlockingQueueHolder::InitOnce()` - LoDTensorBlockingQueue(size_t capacity, const std::vector& dims); - - public: - size_t Size() const { return queue_.Size(); } // Get the current size of the queue - - size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue - - void Close() { return queue_.Close(); } - - bool IsClosed() const { return queue_.IsClosed(); } - - // Block if Size() == Cap() - // Return false only when queue_.IsClosed() == true - bool Push(const std::vector &lod_tensor_vec); - - // Block if Size() == 0. - // *Success == false when queue_.IsClosed() == true - std::vector Pop(bool *success = nullptr); - - private: - // Use reader::BlockingQueue as the inner data structure - BlockingQueue> queue_; - std::vector dims_; -}; - -class LoDTensorBlockingQueueHolder { - public: - // Call the constructor of `LoDTensorBlockingQueue` to create queue_ - // `InitOnce` can only called once, otherwise an exception would raise - void InitOnce(size_t capacity, const std::vector& dims) { - PADDLE_ENFORCE(queue_ == nullptr); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); - } - - const std::shared_ptr& GetQueue() const { return queue_; } - - private: - std::shared_ptr queue_; -}; -``` - -There are some major things that must be concerned: -- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. -- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. -- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. - - -## Release of the GIL in pybind -`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel. - - -## Design of PyReader -`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object. -```C++ -class PyReader : public ReaderBase { - public: - explicit PyReader(const std::shared_ptr& queue); - - void ReadNext(std::vector* out) override { - bool success; - *out = queue_->Pop(&success); - if (!success) out->clear(); - } - - void ReInit() override { return; } - - private: - std::shared_ptr queue_; -}; -``` - - -## Design of CreatePyReaderOp -`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. -```C++ -class CreatePyReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - if (out->Get() != nullptr) return; - - const std::string& queue_name = Input("blocking_queue"); - auto* queue_holder_var = scope.FindVar(queue_name); - PADDLE_ENFORCE(queue_holder_var != nullptr); - auto* queue_holder = queue_holder_var - ->template GetMutable(); - out->Reset(new PyReader(queue_holder->GetQueue())); - } -}; -``` - -## Design of Python codes -The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned. -```Python -def py_reader(capacity, shapes): - queue_name = unique_name.generate("lod_tensor_blocking_queue") - var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue - out = create_var() - create_py_reader_op_with_queue_name( - inputs={'blocking_queue': queue_name}, - outputs={'Out':[out]}) - return out, feed_queue -``` diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md deleted file mode 100644 index dcf76649357aaef80d6bc1a933ece8c4c1063547..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/scope.md +++ /dev/null @@ -1,124 +0,0 @@ -# Design of Scope in Paddle - -## Overview - -Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes: - -- Scope is an association of a name to variable. -- Variables in a parent scope can be retrieved from local scope. - -A detailed explanation of these two attributes goes as following. - - -## Scope is an association of a name to variable. - -Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope. - - -1. Scope only contains a map of a name to variable. - - All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc. - -1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear. - -1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them. - `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables. - - `Create` is used to create a Variable by its name and add the mapping relation. - - `Get` is used to find a Variable by name. - -1. Every variable only belongs to one certain Scope. - - Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. - -1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. - - Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed. - -```cpp -class Scope { - public: - Variable* Var(const std::string& name); - const Variable* FindVar(const std::string& name) const; - - private: - std::unordered_map> vars_; -}; -``` - - -## Parent scope and local scope - -Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope. - -1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed. -2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent. - -```cpp -class Scope { - public: - Scope(const std::shared_ptr& scope): parent_(scope) {} - - Variable* FindVar(const std::string& name) const { - auto it = vars_.find(name); - if (it != vars_.end()) { - return it->second.get(); - } else if (parent_ != nullptr) { - return parent_->FindVar(name); - } else { - return nullptr; - } - } - - private: - std::shared_ptr parent_ {nullptr}; -}; -``` - -In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr. - -A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. - -## Interface Design - -```cpp -class Variable { - private: - Variable() = default; - friend class Scope; -}; - -class Scope { - private: - Scope(const std::shared_ptr& parent = nullptr); - - public: - static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); - - // return nullptr if not found. - Variable* FindVar(const std::string& name) const; - - // return if already contains same name variable. - Variable* Var(const std::string& name); - - private: - std::shared_ptr parent_; - std::unordered_map> vars_; -}; -``` -## Only scope can create a variable - -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. - -## When scope destroyed, all variables inside this scope should be destroyed together - -The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. - -## Sharing a parent scope - -Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed. - -Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer. - -## Orthogonal interface - -`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/fluid/design/concepts/tensor.md b/doc/fluid/design/concepts/tensor.md deleted file mode 100644 index 0a27ac9bb6b03649d42e12100fda9e80a56e7f56..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/tensor.md +++ /dev/null @@ -1,189 +0,0 @@ -# Tensor: An Unified Data Type in PaddlePaddle - -## Pain Point - -In this week, we discussed several potential weaknesses of PaddlePaddle caused by rapid iteration and development to promote new business products on the line in recent four years. For instance, current Matrix/Vector implementation in PaddlePaddle are long and tedious to read, which interfered seriously with the contribution of both fresh and professional engineers. More seriously for this issue, it will also become too challenging to maintain over time. - - -## Learn from Majel - -Consequently, we decide to refactor PaddlePaddle step-by-step. First, refactor and replace Matrix/Vector to Tensor, a modern terminology in the deep learning system. Fortunately, we can learn from Majel how to define a Tensor. - -To simplify heterogeneous resource allocation in any dimensions (1-9) and types (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of them are standard C++ class templates. - -1. `Place`: memory location [i.e. CPU/GPU]. -2. `Allocation`: heterogeneous resource allocator [i.e. 20MB in GPU]. -3. `Dim`: size of each dimension. [i.e. Dim<4>({10, 2, 5, 1})] -4. `Array`: dynamic array consists of `Place`, `Dim`, and a pointer to memory. - -If you dig deeper into Majel source code, you will find Majel heavily use `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value." - -As a simple example, consider the following: - -```c++ -#include "boost/variant.hpp" -#include - -class my_visitor : public boost::static_visitor -{ -public: - int operator()(int i) const - { - return i; - } - - int operator()(const std::string & str) const - { - return str.length(); - } -}; - -int main() -{ - boost::variant< int, std::string > u("hello world"); - std::cout << u; // output: hello world - - int result = boost::apply_visitor( my_visitor(), u ); - std::cout << result; // output: 11 (i.e., length of "hello world") -} -``` - -In Majel, `DDimVar` is derived from `Dim`, `DArrayVar` is from `Array`. - -```c++ -template -struct Dim { -... -int head; -Dim tail; -} -``` - -```c++ -template -class Array : public Buffer { - ... -private: - Dim size_; - Dim stride_; - T* ptr_; -}; -``` - -```c++ -typedef boost::variant Place; -typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, - Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; -typedef boost::variant< - Array, - Array, - Array, - Array, - - Array, - Array, - Array, - Array, - - Array, - Array, - Array, - Array > DArrayVar; -``` - -Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle. - -`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` by following way: - - ```c++ - DArray arr = make_darray(make_ddim({2,3}), 0.0f); - ``` - It means that `arr` will be a two-dimension tensor, or a matrix. The size of its first dimension is 2 and the second is 3. All the element value of `arr` will be initialized as 0.0 . - - The second meaning of `DDim` is tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do like this: - - ```c++ - arr[make_ddim({0, 1})] = 1.0; - ``` - -## Implement Tensor in Paddle - -We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers. - -Tensor is only responsible for describing computing. It will not take charge of memory allocation policy, handles of some CUDA library context(e.g. cublasHandle, cudnnHandle), and dispatching CUDA kernels. Paddle has realize the initialization and resources management of hardware. - -Before writing code, please make sure you already look through Majel Source Code and grabbed the design philosophy of `DArray` in Majel. - - -### Memory Management -`Allocation` manages a block of memory in device(CPU/GPU). We use `Place` to decribe memory location. The details of memory allocation and deallocation are implememted in `Allocator` and `DeAllocator`. Related low-level API such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle. - -### Dim and Array -#### Dim - -`Dim` decribes the dimension information of an array. - -`DDimVar` is an alias of a specializd class of boost.variant class template. - -`DDim` is introduced to represent a dynamically sized dimension. - -For example: - -``` -Dim<2> d1 = make_dim(3, 3); -DDim d2 = make_ddim({1, 2, 3}); -``` - -You must appoint a concrete sized dimension to Dim, whereas DDim can represent a dynamically sized dimension. -#### Array - -`Array` represents for a tensor with specific type and size. - -`DArrarVar` is an alias of a specialized class of boost.variant class template. - -`DArray` is introduced to represent a dynamically typed array. - -For example: - -``` -Array a1(Dim<2>(2, 2)); -DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace()); -``` - -You must appoint the type and dimension of a Array, whereas DArray can represent a dynanmically typed array. - - -Please reference the section of `Learn from Majel` for more details. - -### ArrayView - -`ViewIterator` is a class template which implements basic iterator operation, including increment(++), decrement(--), dereference(*), equality comparisons(==) and so on. - -`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView. And the `end()` method returns an iterator pointing to the pass-the-end element in the ArrayView. - -`ArrayView` make the visting and manipulating an array more efficiently, flexibly and safely. - - -A global function `make_view` is provided to transform an array to corresponding arrayview. - -``` -template -ArrayView make_view(const Array& in) { - return in; -} -``` - -A global function `make_iterator` is provided to make iterator of an array. - -``` -template -ViewIterator> make_iterator(const Array& in, Dim idx) { - return make_iterator(make_view(in), idx); -} -``` - -### Basic Operations - -The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on. - -An array will be trasformed into an arrayview and then passed to the operation launching on a specific device(CPU/GPU). diff --git a/doc/fluid/design/concepts/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md deleted file mode 100644 index 37e4f7b90f94fa3eb015e733999cd84c96b2239c..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/tensor_array.md +++ /dev/null @@ -1,271 +0,0 @@ -# Design for TensorArray -This design doc presents the necessity of a new C++ class `TensorArray`. -In addition to the very simple C++ implementation - -```c++ -class TensorArray { - public: - explicit TensorArray(const LoDTensor&); - explicit TensorArray(size_t size); - - private: - vector values_; -}; -``` - -We also need to expose it to PaddlePaddle's Python API, -because users would want to use it with our very flexible operators `WhileLoop`. -An example for a RNN based on dynamic operators is - -```python -input = pd.data(...) -num_steps = Var(12) - -TensorArray states(size=num_steps) -TensorArray step_inputs(unstack_from=input) -TensorArray step_outputs(size=num_steps) - -W = Tensor(...) -U = Tensor(...) -default_state = some_op() - -step = Var(1) - -wloop = paddle.create_whileloop(loop_vars=[step]) -with wloop.frame(): - wloop.break_if(pd.equal(step, num_steps) - pre_state = states.read(step-1, default_state) - step_input = step_inputs.read(step) - state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) - states.write(step, state) - step_outputs.write(step, state) # output state - step.update(state+1) - -output = step_outputs.stack() -``` - -## Background -Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. - -An RNN can be implemented with the following pseudocode - -```c++ -Array states; -Array input_segments; -Array output_segments; -Parameter W, U; - -step = 1 -seq_len = 12 -while_loop { - if (step == seq_len) break; - states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); - output_segments[step] = states[step] // take state as output - step++; -} -``` -According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. - -Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. - - -Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). -Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. - -As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. - -The implementation is similar to `recurrent_op`. -The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** - - -Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, -the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. - -## Why `TensorArray` -The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. - -The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. - -So there should be an array-like container, which can store the segments of a tensor or LoD tensor. - -**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . -This is where the notion of `TensorArray` comes from. - -## Introduce TensorArray to uniform all the three RNNs -TensorArray as a new concept is borrowed from TensorFlow, -it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. - -This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, -such as `recurrent_op`, `RecurrentGradientMachine`. - -In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), -`TensorArray` is used to segment inputs and store states in all time steps. -By providing some methods similar to a C++ array, -the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. - -## Dynamic-operations on TensorArray - -`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented - -```python -# several helper operators for TensorArray -def tensor_array_stack(ta, tensor): - ''' - get a tensor array `ta`, return a packed `tensor`. - ''' - pass - -def tensor_array_unstack(tensor, ta): - ''' - get a `tensor`, unstack it and get a tensor array `ta`. - ''' - pass - -def tensor_array_write(ta, index, tensor, data_shared): - ''' - get a `tensor` and a scalar tensor `index`, write `tensor` into index-th - value of the tensor array `ta`. - `data_shared` is an attribute that specifies whether to copy or reference the tensors. - ''' - pass - -def tensor_array_read(ta, index, tensor): - ''' - get a tensor array `ta`, a scalar tensor `index`, read the index-th value of - `ta` and return as the `tensor`. - ''' - pass - -def tensor_array_size(ta, tensor): - ''' - get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. - ''' - pass -``` - -It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, -for example - -```python -class TensorArray: - def __init__(self, name): - self.name = name - self.desc = TensorArrayDesc() - - def stack(self, name=None): - ''' - Pack the values in a `TensorArray` into a tensor with rank one higher - than each tensor in `values`. - `stack` can be used to split tensor into time steps for RNN or whileloop. - - @name: str - the name of the variable to output. - ''' - tensor = Var(name) - tensor_array_stack(self.name, tensor) - return tensor - - def unstack(self, input): - ''' - Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. - `unstack` can be used to concatenate all the time steps for RNN or whileloop. - - @input: str - the name of input tensor - ''' - tensor_array_unstack(tensor, self.name) - - def write(self, index, value, data_shared=True): - ''' - Write value into index of the TensorArray. - If `data_shared` is set to True, than the index-th value in TensorArray will - be shared with the tensor passed in. - - @index: str - name of a scalar tensor - @value: str - name of a tensor - @data_shared: bool - ''' - tensor_array_write(self.name, index, value, data_shared) - - def read(self, index, output): - ''' - Read the value at location `index` in the `TensorArray`. - - @index: str - name of a scalar tensor - @output: - name of a output variable - ''' - tensor_array_read(self.name, index, output) - - - def size(self, output): - ''' - Return the number of values. - - @output: str - name of a scalar tensor - ''' - tensor_array_size(self.name, output) -``` - -## LoDTensor-related Supports -The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. - -Since each step of RNN can only take a tensor-represented batch of data as input, -some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. - -Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, -these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. - -Some definitions are like - -```python -def unpack(level): - ''' - Split LodTensor in some `level` and generate batches, if set `sort_by_length`, - will sort by length. - - Returns: - - a new `TensorArray`, whose values are LodTensors and represents batches - of data. - - an int32 Tensor, which stores the map from the new batch's indices to - original LoDTensor - ''' - pass - -def pack(level, indices_map): - ''' - Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` - and `level` and `indices_map`. - ''' - pass -``` - -With these two methods, a varience-length sentence supported RNN can be implemented like - -```c++ -// input is the varient-length data -LodTensor sentence_input(xxx); -TensorArray ta; -Tensor indice_map; -Tensor boot_state = xxx; // to initialize rnn's first state -TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map); -TessorArray step_outputs; -TensorArray states; - -for (int step = 0; step = ta.size(); step++) { - auto state = states.read(step); - // rnnstep is a function which acts like a step of RNN - auto step_input = ta.read(step); - auto step_output = rnnstep(step_input, state); - step_outputs.write(step_output, true/*data_shared*/); -} - -// rnn_output is the final output of an rnn -LoDTensor rnn_output = ta.pack(ta, indice_map); -``` -the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, -the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md deleted file mode 100644 index 8db67f6703d142da71cf06bd4f7e2cb13556f9b0..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/var_desc.md +++ /dev/null @@ -1,100 +0,0 @@ -# Design Doc: Var_desc - -## Background -PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. - -PaddlePaddle uses proto message to describe compile time program because : - -1. The computation program description must be serializable and saved in a file. -1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker. - -The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below. - - - - - - - - - - - - - - - - - - - - - -
compile timeruntime
Data VarDesc(proto) Variable(cpp)
Operation OpDesc(proto) Operator(cpp)
- - -## Definition of VarType - -A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following: - -```proto -message VarDesc { - required string name = 1; - required VarType type = 2; - optional bool persistable = 3 [ default = false ]; -} -``` - -## Definition of TensorDesc - -```proto -message TensorDesc { - // Should only be PODType. Is enforced in C++ - required Type data_type = 1; - repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] -} -``` - -The `Type` here comes from the enum defined inside of `VarType` : - -```proto -enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - - // Other types that may need additional descriptions - LOD_TENSOR = 7; - SELECTED_ROWS = 8; - FEED_MINIBATCH = 9; - FETCH_LIST = 10; - STEP_SCOPES = 11; - LOD_RANK_TABLE = 12; - LOD_TENSOR_ARRAY = 13; - PLACE_LIST = 14; - READER = 15; - CHANNEL = 16; -} -``` - -A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). - -## Definition of LodTensorDesc - -```proto -message LoDTensorDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; -} -``` - -A LoDTensorDesc contains a tensor and a lod_level. - -## Definition of Variable in Python - -For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/fluid/design/concepts/variable.md b/doc/fluid/design/concepts/variable.md deleted file mode 100644 index 442ef6b718b227d79ca73031efcbb55817558252..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concepts/variable.md +++ /dev/null @@ -1,52 +0,0 @@ -# Design Doc: Variable - - -Variable is also known as *blob* in MxNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators. - -## Requirements: Lazy Memory Allocation - -For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN. - -To use the minimum amount of memory, we would like that a variable allocates memory only when it has to, or, lazy memory allocation. Let's take the following example: - -```cpp -Variable vr, v1, v2; - -Tensor* t1 = new Tensor(); -Tensor* t2 = new Tensor(); - -Randomize( - /* malloc */ v1.GetMutable().mutable_data(DDim(100,200)), - /* size */ t1.Size()); - -Randomize( - /* malloc */ v2.GetMutable().mutable_data(DDim(200,300)), - /* size */ t2.Size()); - -Mult( - /*result*/ vr.GetMutable().mutable_data(SizeOfMult(v1, v2)), - /*input1*/ v1.Get().data(), - /*input2*/ v2.Get().data()); -``` - -We see that a variable holds nothing until `Variable::GetMutable()` allocates a tensor and puts it in the variable. Similarly, a tensor gets its memory until `Tensor::mutable_data()`. - -This syntax for lazy memory allocation when we call `Randomize` and `Mult`, those functions that mutate the variable, so it saves us some line of C++ code. - - -## Implementation: Type Hiding - -To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template class Variable`. - -Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`. - -But anyway, Variable needs to know `T` so could it `delete(ptr)` and so could `Variable::Get` checks the expected type and the saved object's type. - -We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter. - -Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`. - - -## Conclusion - -The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code. diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md deleted file mode 100644 index df67438bcc741ac521b00ee962fc13c93db21182..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/channel.md +++ /dev/null @@ -1,139 +0,0 @@ -# Channel Design - -## Introduction - -A Channel is a data structure that allows for synchronous interprocess -communication via message passing. It is a fundemental component of CSP -(communicating sequential processes), and allows for users to pass data -between threads without having to worry about synchronization. - -## How to use it - -Paddle offers python APIs to open and close channels, along with sending -and receiving data to/from a channel. - -### Create a channel - -Creates a new channel that takes in variables of a specific dtype. - -- **fluid.make_channel(dtype, capacity=0)** - - **dtype**: The data type of variables being sent/received through channel - - **capacity**: The capacity of the channel. A capacity of 0 represents - an unbuffered channel. Capacity > 0 represents a buffered channel - -``` -ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, 10) -``` - -### Close a channel - -Closes a channel. Any pending senders and receivers will be awoken during -this time. Receivers can still receive from a closed channel, but senders -are not allowed to send any additional data to the channel (Paddle will -raise an exception if users try to send to a closed channel.) - -- **fluid.channel_close(channel)** - -``` -fluid.channel_close(ch) -``` - -### Send data to a channel - -Sends a variable to a channel. Currently, variables of dtype `LoDTensor`, -`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and -`ChannelHolder` are supported. - -By default, the data of the Variable is moved from the sender to the receiver, -however the user can optionally copy the data before performing the send. - -- **channel_send(channel, variable, is_copy=False)** - - **channel**: The channel to send the variable to - - **variable**: The variable to send to the channel - - **is_copy**: If set to True, channel_send will perform a variable assign - to copy the source variable to a new variable to be sent. - -``` -ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) -var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100) -fluid.channel_send(ch, var, True) -``` - -### Receive data from a channel - -Receives a variable from a channel. The data of the variable is moved to the -receiving variable. - -- **channel_recv(channel, return_variable)** - - **channel**: The channel to receive the variable from - - **return_variable**: The destination variable used to store the data of the - variable received from the channel - -``` -ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) -var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1) -fluid.channel_recv(ch, var) -``` - -## How it Works - -Channels provides a simple interface for different threads to share data. -To support the synchronization requirements, channels utilizes a series of -internal queues, locks, and conditional variables. - -### QueueMessage - -QueueMessage encapsulates the state of the channel send/receive operation to be -put in the **sendq/recvq**. It contains a condition variable used to lock the -thread (when there are no available sends/receives). In addition, it contains -a callback function to notify a thread when the QueueMessage is being -processed by the channel. - -### Queues - -- **buff_**: This queue holds the data buffer in a buffered channel. The -capacity is set to the capacity of the channel. This data buffer is not -used in an unbuffered channel. - -- **sendq**: This queue holds the QueueMessage of any pending senders of a -channel. When a thread performs a channel_send operation on the channel, the -channel_send operation will put a new QueueMessage on the sendq and block the -current thread under two conditions: - 1. The channel is buffered and is full - 2. The channel is unbuffered and does not have a receiver - -- **recvq**: This queue holds the QueueMessage of any pending receivers of a -channel. When a thread performs a channel_recv operation on the channel, the -channel_recv operation will put a new QueueMessage on the recvq and block the -current thread under two conditions: - 1. The channel is buffered and there is no data on the buff_ - 2. The channel is unbuffered and does not have a sender - -### State diagram - -#### Channel Send - -

-
-

- -#### Channel Receive - -

-
-

- -## Limitations and Considerations - -### Variable Copy - -In golang, variables in channels are copied from the sender to the receiver. -In Paddle, the data from our variables are **moved** from sender to receiver. -As a result, these variables should not be used after they are sent. We -provide a flag in channel_send method to allow users to copy the variable to -be sent before it is sent. - -Please note that this is acheived by adding an **assign** operator and creating -a temporary variable that is sent in place of the original variable. Please -note that **assign** operator has limited support for only certain variables -datatypes. diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md deleted file mode 100644 index 0428e74f9e00a87f6b0972057f48479b8ae56ad6..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/concurrent_programming.md +++ /dev/null @@ -1,193 +0,0 @@ -# Design Doc: Concurrent Programming with Fluid - -With PaddlePaddle Fluid, users describe a program other than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. - -Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting questions is **how does a `ProgramDesc` represents a concurrent program?** - -The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org). - -## An Analogy - -The following table compares concepts in Fluid and Go - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
GoFluid
user-defined functions -layers
control-flow and built-in functions -intrinsics/operators
goroutines, channels -class ThreadPool
runtime -class Executor
- - -## An Example Concurrent Program - -To review all above concepts in an example, let us take a simple program and writes its distributed version. - -Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors. - -```go -import "fluid" - -func paddlepaddle() { - X = fluid.read(...) - W = fluid.Tensor(...) - Y = fluid.mult(X, W) -} -``` - -Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message. - -```protobuf -message ProgramDesc { - block[0] = Block { - vars = [X, W, Y], - ops = [ - read(output = X) - assign(input = ..., output = W) - mult(input = {X, W}, output = Y) - ], - } -} -``` - -Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message. - -The default `main` function is defined as follows: - -```go -func main() { - paddlepaddle() - fluid.run() -} -``` - -## The Concurrent Version - -By parallelizing the above program, we could support very big tensor X by splitting into small pieces {x_1, x_2, ...} and sent each piece to worker process/node for parallel multiplication. - -In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes. - -### The Master Program - -The master program could look like the following: - -```protobuf -message ProgramDesc { - block[0] = Block { - vars = [X, L, Y], - ops = [ - read(output = X) - kube_get_workers_addrs(output = L) - Y = tensor_array(len(L)) - parallel_for(input = X, output = Y, - attrs = {L, block_id(1)}) # referring to block 1 - ] - } - - block[1] = Block { - parent = 0, - vars = [x, y, index], - ops = [ - slice(input = [X, index], output = x) # index is initialized by parallel_for - send(input = x, attrs = L[index]) - recv(outputs = y, attrs = L[index]) - assign(input = y, output = Y[index]) - ] - } -} -``` - -The equivalent Fluid program (calling the Go binding) is: - -```go -func main() { //// block 0 - X = fluid.read(...) - L = fluid.k8s.get_worker_addrs() - Y = fluid.tensor_array(len(L)) - fluid.parallel_for(X, L, - func(index int) { //// block 1 - x = X[index] - fluid.send(L[index], x) - y = fluid.recv(L[index]) - Y[index] = y - }) -} -``` - -An explanation of the above program: - -- `fluid.k8s` is a package that provides access to Kubernetes API. -- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod). -- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, - - 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and - 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread - 1. creates an Executor instance, and - 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above. -1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0. - -### The Worker Program - -The worker program looks like - -```go -func main() { - W = Tensor(...) - x = fluid.listen_and_do( - fluid.k8s.self_addr(), - func(input Tensor) { - output = fluid.mult(input, W) - }) -} -``` - -where - -- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed, - 1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`, - 2. once a connection is established, - 1. creates a scope of two parameters, "input" and "output", - 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input", - 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`. - -## Summarization - -From the above example, we see that: - -1. Fluid enables the imperative programming paradigm by: - 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and - 2. call the `fluid.run` function that runs the program implicitly. -1. The program is described as a `ProgramDesc` protobuf message. -2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter. -3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message. -4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array. -5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request. -6. Threads are not necessarily OS thread; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example green threads is Go's [goroutines](https://tour.golang.org/concurrency/1). diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md deleted file mode 100644 index 66d19f44baf861c7847e81ca83f61024ec877faf..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/csp.md +++ /dev/null @@ -1,251 +0,0 @@ -# Design Doc: CSP in PaddlePaddle Fluid - -## Motivation - -Concurrent programming is important for deep learning. Few example applications are: - -1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing. -2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server. - -Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language. - -## Concurrent Programming Models - -There were many concurrent programming models, implemented in various forms: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
concurrent programming modelimplementation
mutex types and functions in standard libraries
semaphore types and functions in standard libraries
communicating sequential processes (CSP) Go programming language
actor model Erlang programming language
message passing MPI
bulk synchronous parallel (BSP) Pregel distributed programming framework
- - -Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid. - -### CSP v.s. Actor Model - -A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv. - -## CSP in Fluid - -Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following: - -1. a new data type: *channel* and operators *send* and *recv*, -1. *goroutine* or thread, and -1. a new control-flow: select. - -We also need Python wrappers for the above components. - -The type *channel* is conceptually the blocking queue. In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv. - -The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. - -It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax. - -### Type Channel - -Fluid supports many data types: - -1. Tensor, -1. Row-sparse Tensor -1. LoD Tensor, -1. Tensor array, etc - -Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum. - -To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor. - -## Syntax Design - -### Create Channel - -In Go, we create a channel by specifying the element type and buffer size: - -```go -ch := make(chan int) // a channel without buffer -ch1 := make(chan int, 100) // a channel that can buffer 100 ints. -``` - -In Fluid, we should be able to do the same: - -```python -ch = fluid.make_channel(dtype=INT) -ch1 = fluid.make_channel(dtype=INT, 100) -``` - -In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: - -```python -ch = fluid.make_channel(dtype=Tensor, etype=float16) -``` - -or Tensors of Tensors of float16 etc. - -The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor...> >`. - -### Send and Recv - -Go's CSP implementation depends on data type *channel*. There are two types of channels: - -1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty. -1. blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block with unbuffered channels. - -There are four types of actions with a channel: - -1. Create a channel - - ```go - ch := make(chan int) // this is an unbuffered channel - ch := make(chan int, 100) // this is a buffered channel of 100 ints. - ``` - -1. Send - - ```go - ch <- 111 - ``` - -1. Recv - - ```go - y, ok <- ch - ``` - -1. Close - - ```go - close(ch) - ``` - - Please be aware that a closed channel is not a nil channel, which is `var ch chan int`. - -There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms): - -1. A send to a nil channel blocks forever - -1. A receive from a nil channel blocks forever - -1. A send to a closed channel panics - -1. A receive from a closed channel returns the residual values and then zeros. - -In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h) - -The following program illustrates the Python syntax for accessing Fluid buffers. - -```python -import fluid - -buffer_size = 10 -ch = fluid.make_channel(dtype=INT, buffer_size) - -# Now write three elements to the channel -with fluid.while(steps=buffer_size): - fluid.send(ch, step) - -fluid.close_channel(ch) - -with fluid.while(steps=buffer_size): - fluid.print(fluid.recv(ch)) -``` - -The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines. - -```python -import fluid - -ch = fluid.make_channel(dtype=INT) - -with fluid.go(): - fluid.send(ch) - -y = fluid.recv(ch) - -fluid.close_channel(ch) -``` - -### Select - -In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready. - -```go - -ch1 := make(chan int) -ch2 := make(chan int, 100) - -x := 0 - -for { - select { - case ch1 <- x: - x := x + 1 - case y <- ch2: - fmt.Println("Received on channel") - default: - fmt.Println("Default") - } - } - -``` - -In Fluid, we should be able to do the same: - -```python -ch1 = fluid.make_chan(dtype=INT) -ch2 = fluid.make_chan(dtype=INT, 100) - -sel = fluid.select() - -with sel.case(ch1, 'w', X): - fluid.layers.increment(X) - -with sel.case(ch2, 'r', Y): - fluid.print("Received on Channel") - -with sel.default(): - fluid.print("Default") - -``` - -In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one. - -- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O. - -- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O. - -- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed. - -## Example Programs - -### 1. RPC between Trainers and Parameter Servers - -### 2. Concurrent Minibatch Loading diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md deleted file mode 100644 index c18b788e80f432ebb2f14b15229e7823c112001e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/go_op.md +++ /dev/null @@ -1,231 +0,0 @@ -# go_op Design - -## Introduction - -The **go_op** allows user's of PaddlePaddle to run program blocks on a detached -thread. It works in conjuction with CSP operators (channel_send, -channel_receive, channel_open, channel_close, and select) to allow users to -concurrently process data and communicate easily between different threads. - -## How to use it - -``` -channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - -with fluid.Go(): - # Send a tensor of value 99 to "channel" on a detached thread - tensor = fill_constant(shape=[1], dtype='int', value=99) - tensor.stop_gradient = True - fluid.channel_send(channel, tensor) - -# Receive sent tensor from "channel" on the main thread -result = fill_constant(shape=[1], dtype='int', value=-1) -fluid.channel_recv(ch, result) -``` - -The go operator can be accessed by using the fluid.Go() control flow. This -will create a new sub block, where the user can add additional operators -to be ran on the thread. - -**Note:** Since back propegation is currently not support in the go_op, users -should ensure that operators in the go block does not require gradient -calculations. - -## How it Works - -Similar to other control blocks, go_op will create a sub block and add it -as a child to the current block. Operators and variables defined in this -block will be added to the go sub_block. - -In addition, the go operator will create a new child scope whose parent is -the global scope. Please refer to [block captures](#block-captures) for more -information. - -When Paddle executor runs go_op, go_op will take the sub_block and pass it to -the executor.run method (along with a newly created local scope) on a detached -thread. - -An example of the generated program description is shown below. Take note of -the **go_op** in particular. It is added as an operator in the current -block (in this example, block0). The **go_op** contains a `sub_block` -attribute, which points to the id of the block that will be executed in a -detached thread. - -``` -blocks { - idx: 0 - parent_idx: -1 - vars { - name: "return_value" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: INT64 - } - } - } - } - vars { - name: "status_recv" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: BOOL - } - } - } - } - ... - ops { - outputs { - parameter: "Out" - arguments: "channel" - } - type: "channel_create" - attrs { - name: "data_type" - type: INT - i: 7 - } - attrs { - name: "capacity" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "channel" - } - type: "go" - attrs { - name: "sub_block" - type: BLOCK - block_idx: 1 - } - } - ops { - inputs { - parameter: "Channel" - arguments: "channel" - } - outputs { - parameter: "Out" - arguments: "return_value" - } - outputs { - parameter: "Status" - arguments: "status_recv" - } - type: "channel_recv" - } - ... -} - -blocks { - idx: 1 - parent_idx: 0 - vars { - name: "status" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: BOOL - } - } - } - } - ... - - ops { - outputs { - parameter: "Out" - arguments: "fill_constant_1.tmp_0" - } - type: "fill_constant" - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: 99.0 - } - attrs { - name: "shape" - type: INTS - ints: 1 - } - attrs { - name: "dtype" - type: INT - i: 3 - } - } - ops { - inputs { - parameter: "Channel" - arguments: "channel" - } - inputs { - parameter: "X" - arguments: "fill_constant_1.tmp_0" - } - outputs { - parameter: "Status" - arguments: "status" - } - type: "channel_send" - attrs { - name: "copy" - type: BOOLEAN - b: false - } - } -``` - -## Current Limitations - -#### Scopes and block captures: - -Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a -block. When a block is executed, a new local scope is created from the parent -scope (ie: scope derived from the parent block) and associated with the new -child block. After the block finishes executing, then the local scope and -all associated variables in the scope is deleted. - -This works well in a single threaded scenario, however with introduction of -go_op, a child block may continue to execute even after the parent block has -exited. If the go_op tries to access variables located in the parent block's -scope, it may receive a segmentation fault because the parent scope may have -been deleted. - -We need to implement block closures in order to prevent access to parent -scope variables from causing a segmentation fault. As a temporary workaround, -please ensure that all variables accessed in the go block is not destructed -before it is being accessed. Currently, the go_op will explicitly enforce -this requirement and raise an exception if a variable could not be found in -the scope. - -Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502) -for more details. - -#### Green Threads - -Golang utilizes `green threads`, which is a mechnism for the runtime library to -manage multiple threads (instead of natively by the OS). Green threads usually -allows for faster thread creation and switching, as there is less overhead -when spawning these threads. For the first version of CSP, we only support -OS threads. - - -#### Backward Propegation: - -go_op currently does not support backwards propagation. Please use go_op with -non training operators. diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png deleted file mode 100644 index c06cd15ae7b8a8c94d5742f6675e389081fcf789..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concurrent/images/channel_recv.png and /dev/null differ diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png deleted file mode 100644 index 006ebb4a5a4bcd32c97847e9fb7729a740255f7c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concurrent/images/channel_send.png and /dev/null differ diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png deleted file mode 100644 index 719ed76f9d542d6c4f20c30f27656bb53325aa85..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/concurrent/images/select_op_workflow.png and /dev/null differ diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst deleted file mode 100644 index e47135e9fc42760898083710e0a6767252a0225b..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -并发编程 ------------- - -.. toctree:: - :maxdepth: 1 - - concurrent_programming.md - parallel_do.md diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst deleted file mode 100644 index 0727e75798b2a869588f80d3cce7a886554e4ffb..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/index_en.rst +++ /dev/null @@ -1,8 +0,0 @@ -Concurrent Programming -------------------------- - -.. toctree:: - :maxdepth: 1 - - concurrent_programming.md - parallel_do.md diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md deleted file mode 100644 index 42bd136f825986d94fafaeaa5f58edb02848a74c..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/parallel_do.md +++ /dev/null @@ -1,163 +0,0 @@ -# Design Doc: Parallel_Do in PaddlePaddle - -In PaddlePaddle, we use parallel_do primitive to represent multithread data parallel processing. - -## Design overview - -The definition of a parallel_do op looks like the following - -```c++ -AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable(); -AddInput(kParameters, "Parameters are duplicated over different devices") - .AsDuplicable(); -AddInput(kPlaces, "Devices used for parallel processing"); -AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable(); -AddOutput(kParallelScopes, - "Scopes for all local variables in forward pass. One scope for each device"); -AddAttr(kParallelBlock, - "List of operaters to be executed in parallel"); -``` - -A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and -`||||` means multiple threads) - -``` -In the forward pass - | Split input onto different devices - | Copy parameter onto different devices - |||| Compute forward pass in parallel - | Merge output from different devices - -In the backward pass - | Split output@grad onto different devices - |||| Compute backward pass in parallel - | accumulate param@grad from different devices to the first device - | Merge input@grad from different devices -  | Copy param@grad to the place of parallel_do_op -``` - -This implementation allows to write mixed device program like this - -```python -W1 = fluid.tensor(size=[100,20], parameter=true) -W2 = fluid.tensor(size=[20,15], parameter=true) - -data = layers.data() - -gpu_places = layers.get_place(use_gpu=True) -# parallel processing on multiple GPUs -pd = ParallelDo(gpu_places) -with pd.do(input=data): - prediction = softmax(fc(fc(data, W1), W2)) - write_output(prediction) -prediction = pd() -loss = cross_entropy(prediction, label) -``` - -And the programDesc are like the following - -``` -# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU -start_program -{ - vars: w1, w2 - ops: init(w1), init(w2) -} - -main_program -{ -block0 { - vars: data, places, w1, w2, w1_grad, w2_grad, - ops: data, get_place, parallel_do(block1), - parallel_do_grad(block2), - sgd(w2, w2_grad), - sgd(w1, w1_grad) -} -block1 { # the forward pass - parent_block: 0 - vars: data, h1, h2, loss - ops: fc, fc, softmax -} -block2 { # the backward pass - parent_block: 1 - vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad - ops: softmax_grad, - fc_grad - fc_grad -} -} -``` - -## Performance Imporvement - -There are serial places we can make this parallel_do faster. - -### forward: split input onto different devices - -If the input of the parallel_do is independent from any prior opeartors, we can avoid this step by -prefetching the input onto different devices in a seperate background thread. And the python code -looks like this. -```python -pd = ParallelDo(gpu_places) -with pd.do(): -    feature = get_data_from_prefetch_queue(gpu_places) - prediction = my_net(feature) - write_output(activation) -``` - -### forward: Copy parameter to onto different devices - -We can avoid this step by making each device have a copy of the parameter. This requires: - -1. `fluid.default_start_up_program()` to be run on all devices -1. In the backward, allreduce param@grad at different devices, this requires - 1. `backward.py` add `allreduce` operators at parallel_do_grad - 1. `allreduce` operators need to be called in async mode to achieve maximum throughput -1. apply gradients related op(i.e. cliping, normalization, decay, sgd) on different devices in parallel - -By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device". -And the ProgramDesc looks like the following - -``` -# w1, w2 will be allocated on all GPUs -start_program -{ -block0 { - parallel_do(block1) -} -block1 { - parent_block: 0 - vars: w1, w2 - ops: init(w1), init(w2) -} -} - -main_program -{ -block0 { - vars: data, places, w1, w2 - ops: data, get_place, parallel_do(block1), - parallel_do_grad(block2), # append_backward - parallel_do(block3) # append_optimization - -} -block1 { - parent_block: 0 - vars: data, h1, h2, loss - ops: fc, fc, softmax -} -block2 { - parent_block: 1 - vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad - ops: softmax_grad, - fc_grad, allreduce(places, scopes, w1_grad), - fc_grad, allreduce(places, scopes, w2_grad) -} -block3 { - parent_block: 0 - vars: lr - ops: sgd(w2, w2_grad), - sgd(w1, w1_grad) -} -} -``` diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md deleted file mode 100644 index 4fcae57cc7932cdaebe549486e7f7cebf0bd038a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/concurrent/select_op.md +++ /dev/null @@ -1,265 +0,0 @@ -# select_op Design - -## Introduction - -In golang, the [**select**](https://golang.org/ref/spec#Select_statements) -statement lets a goroutine wait on multiple communication operations at the -same time. The **select** blocks until one of its cases can run, then -executes the case. If multiple cases are ready to run, then one case is -choosen at random to be executed. - -With the introduction of CSP for Paddle, we mimic this behavior by -creating a ***select_op***. - -## How to use it - -The **select_op** is available as a c++ operator. However most users -will prefer to use the much simplier Python API. - -- **fluid.Select()**: Creates a select operator and adds it to the current -block within the main program. Also creates a sub block and adds it to the -main program. This sub block is used to hold all variables and operators -used by the case statements. - -Within the select block, users can add cases by -calling **select.case** or **select.default** method. - -- **fluid.Select.case(channel_action, channel, result_variable)**: Represents -a fluid channel send/recv case. This method creates a SelectCase block -guard and adds it to the Select block. The arguments into this method tells -the select which channel operation to listen to. - -- **fluid.Select.default()**: Represents the fluid default case. This default -case is executed if none of the channel send/recv cases are available to -execute. - -**Example:** -``` -ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) -quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) - -x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) -y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1) - -while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True) -while_op = While(cond=while_cond) - -with while_op.block(): - with fluid.Select() as select: - with select.case(fluid.channel_send, channel, x): - # Send x, then perform Fibonacci calculation on x and y - x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - assign(input=x, output=x_tmp) - assign(input=y, output=x) - assign(elementwise_add(x=x_tmp, y=y), output=y) - with select.case(fluid.channel_recv, quit_channel, result2): - # Exit out of While loop - while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False) - helper = layer_helper.LayerHelper('assign') - helper.append_op( - type='assign', - inputs={'X': [while_false]}, - outputs={'Out': [while_cond]}) -``` - -## How it Works - -### Program Description - -``` -blocks { - idx: 0 - ... - // Create "case_to_execute" variable - ops { - outputs { - parameter: "Out" - arguments: "fill_constant_110.tmp_0" - } - type: "fill_constant" - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: -1.0 - } - attrs { - name: "shape" - type: INTS - ints: 1 - } - attrs { - name: "dtype" - type: INT - i: 2 - } - } - // Create "select" operator. - // inputs: - // X: All input variables used by operators within the select block - // case_to_execute: Variable filled in by select_op when it determines - // which case to execute. - // - // outputs: - // Out: All output variables referenced by operators within select block. - // - // attrs: - // sub_block: The block id containing the select "cases" - // cases: Serialized list of all cases in the select op. - // Each case is serialized as: ',,,' - // where type is 0 for default, 1 for send, and 2 for receive. - // No channel and values are needed for default cases. - ops { - inputs { - parameter: "X" - arguments: "fill_constant_103.tmp_0" - arguments: "fill_constant_104.tmp_0" - } - inputs { - parameter: "case_to_execute" - arguments: "fill_constant_110.tmp_0" - } - outputs { - parameter: "Out" - arguments: "fill_constant_110.tmp_0" - } - type: "select" - attrs { - name: "sub_block" - type: BLOCK - block_idx: 1 - } - attrs { - name: "cases" - type: STRINGS - strings: "0,1,channel_101,fill_constant_109.tmp_0" - strings: "1,2,channel_102,fill_constant_108.tmp_0" - } - } - ... -} -``` - -The python select API will add the **select_op** to the current block. In addition, it will -iterate through all it's case statements and add any input variables required by case statements -into **X**. It will also create a temp variable called **case_to_execute**. This variable is -filled in by the select_op after it has completed processing the case statements. - -If there are no available cases to execute (ie: all cases are blocked on channel operations, and -there is no default statement), then the select_op will block the current thread. The thread will -unblock once there is a channel operation affecting one of the case statements, at which point, the -**select_op** will set the **case_to_execute** variable to the index of the case to execute. - -Finally the select_op will call executor.run on the **sub_block**. - -``` -blocks { - idx: 1 - parent_idx: 0 - ... - // Fill a tensor with the case index (ie: 0,1,2,3,ect.) - ops { - outputs { - parameter: "Out" - arguments: "fill_constant_111.tmp_0" - } - type: "fill_constant" - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: INTS - ints: 1 - } - attrs { - name: "dtype" - type: INT - i: 2 - } - } - // Create an "equal" operator to compare the case index with the "case_to_execute" - // tensor (which was filled in by the select op). - ops { - inputs { - parameter: "X" - arguments: "fill_constant_111.tmp_0" // case 0 - } - inputs { - parameter: "Y" - arguments: "fill_constant_110.tmp_0" // case_to_execute - } - outputs { - parameter: "Out" - arguments: "equal_0.tmp_0" - } - type: "equal" - attrs { - name: "axis" - type: INT - i: -1 - } - } - // Use the output of the "equal" operator as a condition for the "conditional_block". - // If the condition evaluates to true, then execute the "sub_block" (which represents - // the select case's body) - ops { - inputs { - parameter: "Params" - } - inputs { - parameter: "X" - arguments: "equal_0.tmp_0" - } - outputs { - parameter: "Out" - } - outputs { - parameter: "Scope" - arguments: "_generated_var_0" - } - type: "conditional_block" - attrs { - name: "is_scalar_condition" - type: BOOLEAN - b: true - } - attrs { - name: "sub_block" - type: BLOCK - block_idx: 4 - } - } - ... - // Repeat the above operators for each case statements inside the select body -} - -``` - -Cases are represented by a **conditional_block operator**, whose's condition is set as the output of -equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block, -only one case will be executed. - -### select_op flow - -

-
-

- -The select algorithm is inspired by golang's select routine. Please refer to -http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information. - -## Backward Pass - -TODO diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md deleted file mode 100644 index 844d2aafcf257b85057e1ac200ed3d5cf0be2ff0..0000000000000000000000000000000000000000 --- a/doc/fluid/design/data_type/float16.md +++ /dev/null @@ -1,183 +0,0 @@ -# Design Doc: float16 - -## Why float16 -Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. - -When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially - -- reduce storage space, memory bandwidth, and power usages; -- increase the chance of data fitting into a smaller cache of lower latency; -- provide arithmetic speed up if supported by hardware. - -## Survey of current float16 support -A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info. - -The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. - -### Compiler -- nvcc supports `__half` data type after CUDA 7.5. -- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4. -- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9. - -### Hardware -- `__half` is supported on GPU with compute capability >= 5.3. -- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above. -- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018). - -### Libraries -- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. -- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). - -### CUDA version issue -There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. -CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: -``` -typedef struct __align__(2) { - unsigned short x; -} __half; - -typedef __half half; -``` -This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: -``` -__global__ void Add() { - half a, b, c; - c = __hadd(a, b); // correct - c = a + b; // compiler error: no operator "+" matches these operands -} -``` -CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). - -Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: -``` -typedef struct __CUDA_ALIGN__(2) { - unsigned short x; -} __half_raw; - - -struct __CUDA_ALIGN__(2) __half { -protected: - unsigned short __x; -public: - // constructors and conversion operators from/to - // __half_raw and other built-in data types -} - -typedef __half half; - -__device__ __forceinline__ -__half operator+(const __half &lh, const __half &rh) { - return __hadd(lh, rh); -} - -// Other overloaded operators -``` -This new design makes `c = a + b` work correctly for CUDA half data type. - -## Implementation -The float16 class holds a 16-bit `uint16_t` data internally. -``` -struct float16 { - uint16_t x; -}; -``` - -float16 supports the following features: - - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. - - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen. - - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. - - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. - -To support the above features, two fundamental conversion functions are provided: -``` -float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode -float half_to_float(float16 h); -``` -which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion. - -## float16 inference -In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one. - -### Operator level requirement -Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output. - -This means that if we provide float input to the first operator in a program, then each opeartor will use float kernel to compute float output and send it as input to the next operator to trigger the float kernel. Overall, the program will run in float mode and give us a final output of float data type. - -The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable. - -So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator. - -### Variable level requirement -Operators including convolution and multiplication (used in fully-connected layers) takes as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contains the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type. - -When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights. - -In Fluid, we use tensor to hold actual data for a variable on the c++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind c++ tensors of certain data type with numpy array of the correponding numpy data type on the Python end. Each common c++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in c++, we cannot directly bind numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use c++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h). - -The following code demonstrates how to do the tensor conversion. -```Python -# var is the variable of float weights -# tensor is a numpy array of data copied from the tensor data in var -# fp16_var is the variable that will contain float16 weights converted from var -tensor = numpy.array(var.get_tensor()) -fp16_tensor = fp16_var.get_tensor() - -# After the original tensor data is converted to numpy float16 data type, -# view(numpy.uint16) is used so that the internal memory of the numpy array -# will be reinterpreted to be of uint16 data type, which is binded to -# Fluid float16 class via pybind with the help of uint16_t built-in c++ type -fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace) -``` - -### Consistent API requirement -The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and diffcult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type. - -This problem can be solved by introducing a type-casting operator which takes an input variable of certain data type, cast it to another specified data type, and put the casted data into the output variable. Insert cast operator where needed can make a program internally run in float16 mode. - -### float16 transpiler -Put all the above requirements in mind, we designed a float16 inference transpiler that can tranpile a float32 mode inference program desc to a float16 mode one. - -Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md), -this transpiler mainly does the following modifications: - -1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel. - -2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result. - -3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program. - -4. Update the operator information in the program so that each relevant operator use the newly created float16 variable instead of its float counterpart. - -Below is an example of usage: -```Python -# Get the float inference program -[float_inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - -# Prepare the float input data -tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32) - -# Running inference_program in float mode -float_results = exe.run(float_inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) - -# Use float16 transpiler to speedup -float16_inference_program = float_inference_program.clone() -t = fluid.InferenceTranspiler() -t.float16_transpile(float16_inference_program, GPUPlace) - -# Running -float16_results = exe.run(float16_inference_program, - feed={feed_target_names[0]: tensor_img}, - fetch_list=fetch_targets) -``` - -As we can see from the example above, users can simply use the `float16_transpile` method provided by the infernece transpiler class on an existing float inference program to run inference in float16 mode. - -### Speedup on GPU -Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. - -Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0. - -Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details. diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst deleted file mode 100644 index b60167b6b1599df69dfc5073ebf32bdbb0a316ec..0000000000000000000000000000000000000000 --- a/doc/fluid/design/data_type/index_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -数据类型 ------------- - -.. toctree:: - :maxdepth: 1 - - float16.md diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst deleted file mode 100644 index 6a88d17943f49134a2d00363845e919537ff4545..0000000000000000000000000000000000000000 --- a/doc/fluid/design/data_type/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -Data Type ------------- - -.. toctree:: - :maxdepth: 1 - - float16.md diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md deleted file mode 100644 index 2dd652d8bdcb8f3b6e759347bd55b217be909386..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/README.md +++ /dev/null @@ -1,57 +0,0 @@ -## Distributed training overview doc - -Currently Paddle Fluid use parameter server architecture to support distributed training. - -For synchronous and asynchronous training, the differences are mostly in the logic of parameter server. Now we have already support synchronous training. - -### Synchronous training - -The training process of synchronous training is: - -![synchronous distributed training](./src/sync_distributed_training.png) - -1. Pserver - 1. set `barrier_condition_` to 0 and waits for trainers to send gradient. -1. Trainer - 1. Trainer read minibatch of data, run forward-backward with local parameter copy and get the gradients for parameters. - 1. Trainer use split op to split all the gradient into blocks. The split method is determined at compile time. - 1. Trainer use send_op to send all the split gradients to corresponding parameter server. - 1. After trainer send all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers. - 1. Trainer call GetVariable to pserver and wait for `barrier_condition_` on pserver to be 1. -1. Pserver - 1. Pserver will count the number of `BATCH_BARRIER_MESSAGE`. - 1. When the count of `BATCH_BARRIER_MESSAGE` is equal to the number of Trainer. Pserver thinks it received all gradient from all trainers. - 1. Pserver will run the optimization block to optimize the parameters. - 1. After optimization, pserver set `barrier_condition_` to 1. - 1. Pserver wait for `FETCH_BARRIER_MESSAGE`. -1. Trainer. - 1. The trainer uses GetVariable to get all the parameters from pserver. - 1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver. -1. Pserver. - 1. when the number of `FETCH_BARRIER_MESSAGE` reach the number of all trainers. Pserver think all the parameters have been got. it will go back to 1. to set `barrier_condition_` to 0. - -### Asynchronous training -In the above process. There are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to pserver and then get parameters back. - -The training process of asynchronous training can be: - -![asynchronous distributed training](./src/async_distributed_training.png) - -1. Pserver: - 1. Each parameter has a queue to receive its gradient from trainers. - 1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter. - 1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.) - -1. Trainer: - 1. Trainer read a batch of data. Run forward and backward with local parameter copy and get the gradients for parameters. - 1. Trainer split all gradients to blocks and then send these gradient blocks to pservers(pserver will put them into the queue). - 2. Trainer gets all parameters back from pserver. - -### Note: -There are also some conditions that need to consider. For exmaple: - -1. If trainer needs to wait for the pserver to apply it's gradient and then get back the parameters back. -1. If we need a lock between parameter update and parameter fetch. -1. If one parameter must be on one server, or it can also be split and send to multiple parameter servers. - -The above architecture of asynchronous training can support different mode, we can have a detailed test in the future for these problems. diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md deleted file mode 100644 index 248d2ec18dafdecac9184527638754b6ba4d85b8..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/async_update.md +++ /dev/null @@ -1,61 +0,0 @@ -# Design Doc: Asynchronous Update With Distributed Training - -## Background - -For the typical synchronous distributed training, some significant steps are as follows: - -1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes. -1. After the PS node received gradients came from all the Trainers, It will aggregate the -gradient variables for the same parameter into one gradient variable and then apply the aggregated -gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...) -to update the parameters. -1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS, -so all the Trainers would get the same parameters. - -In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes -have completed running current mini-batch. After that, all trainers can continue to run the next -mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends -on the slowest node. - -In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on -the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would -train such models that achieve scaling, better throughput. In this design doc, we will introduce how to -implement the Asynchronous Distributed Training base on PaddlePaddle Fluid. - -## Design - - - -As the figure above, we describe a global view of the asynchronous update process and use -the parameter `w1` as an example to introduce the steps: -1. For each gradient variables, they may distribute on different GPU card and aggregate -them while they are all calculated. -1. Split the gradient variable into multiple blocks according to the number of PS -instances and then send them. -1. PS would run an `Optimize Block` using a specified optimize algorithm to update -the specified parameter. -1. The trainer will fetch the latest parameter from PS before running forward Op which depends -on the specified parameter. -1. Broadcast the received variable into multiple GPU cards and continue to run the next -mini-batch. - -### Trainer - -- For the multiple devices distributed training, we need to aggregate the gradient -variables which placed on different devices firstly and then schedule a `SendVars` Operator to -send the gradient variables to the multiple PS instances. -- Schedule `FetchVars` operator to fetch the latest parameter from PS before running -the forward ops. -- There could be a large number of gradient variables to be sent, so we need to use another -thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the -computing thread pool to avoid competitive the thread resources with computing. - -### Parameter Server - - - -- There should be multiple trainer instances want to optimize the same parameter at -the same time, to avoid the racing, we need one `BlockingQueue` for each gradient -variable to process them one by one. -- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which -can optimize the respective parameter. diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md deleted file mode 100644 index b8b8427811cddcddf872db5badfd37c96a76c3e3..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/dist_train_nccl2.md +++ /dev/null @@ -1,35 +0,0 @@ -# Distributed Training with NCCL2 - -We design a pattern that can enable training with `ParallelExecutor` and -use [NCCL2](https://developer.nvidia.com/nccl) as it's collective -communication library. - -In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast` -to do multi GPU training. And if we initialize NCCL2 communicators as -ranks in a distributed environment, we can simply run the `ParallelExecutor` -as a distributed program! The only thing that may be different than in -the single node version is that we need to broadcast the NCCL unique ID -to all the nodes and initialize communicators using that ID, so NCCL2 -can know each other as ranks. - -To achieve this feature, we introduce a new operator: `gen_nccl_id` op, -so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in -whatever platform you like. - -It has two running modes: - -1. Generate and broadcast mode, which should be used on trainer 0; -1. Listen and fetch mode, which should be used on trainers other than 0. - -In both two modes, this op can save the NCCL ID into current scope as a -persistable variable, Then we can insert this op at the end of -"startup program" of fluid, so that all workers can get the same ID to -initialize NCCL communicator objects. - - - -The above figure indicates the general process when training with NCCL2 -distributed. Each trainer has the number of communicators equal to the -number of GPUs, but the ranks should match the global ranks number: here -we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should -be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1. diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md deleted file mode 100644 index 371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/distributed_architecture.md +++ /dev/null @@ -1,197 +0,0 @@ -# Design Doc: Fluid Distributed Training Architecture - -## Abstract - -PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: - -1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. - -2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. - -3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. - -This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. - -## Analysis - -The assumption is that the user writes the trainer program in either Python or C++. - -### Limitation 1 - -There are two basic functionalities in the trainer program: - -1. The training logic such as loading / saving the model and printing out the logs. -2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the - optimizer. - -When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the -training logic as well as the neural network computation logic, is replicated. - -The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** -replicate the training logic, the limitation mentioned above can be avoided. - -### Limitation 2 - -Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the -inter-model-shard communication between nodes. - -PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the -computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. - -Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows: - - - -PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: - - - -The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. - -### Limitation 3 - -The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. - -This could be fixed by making the parameter server also run an IR, which can be different to the trainer side -For a detailed explanation, refer to this document - -[Design Doc: Parameter Server](./parameter_server.md) - -## Distributed Training Architecture - -The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: - - - -The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*. - -### Python API - -Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc. - -```Python -images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') -... -predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) -optimizer = fluid.optimizer.Adam(learning_rate=0.01) -optimizer.minimize(avg_cost) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) - -for pass_id in range(10): - for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost]) -``` - -The code above is a typical local training program, the "Training Program" is built using helper functions such as -`fluid.layer.fc`. The training is done by calling `Executor.run` -iteratively. - -For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type. - -[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use -`Executor` to run the program locally. For any kind of distributed training, you can use -`RemoteExecutor` to specify desired distributed training method with some optional arguments. - -### Distributed Transpiler - -The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then -the Remote Executor dispatches the new IRs to Remote Executors across the cluster. -Below are the steps that are followed : - -1. User only need to change `Executor` to `RemoteExecutor` to change local program to distributed program. -1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a - distributed training program: - 1. Parse configurations from `RemoteExecutor`. - 1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming. - 1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. Take - DataParallelism type for example, it removes the optimization operators and add a `send` OP to the - "trainer" role, then add the optimization operators to the parameter server role within the `recv` OP. -1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster. -1. `RemoteExecutor` on each node run the received `ProgramDesc` utill the end. - - -### RemoteExecutor - -As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for Execution. -You can also use parameter `fetch_list` to interactively fetch variable back to local for -log printing. - -The Python `RemoteExecutor` is derived from `Executor` class. - -```python -exe = RemoteExecutor( - feed=feeder.feed(data), - fetch_list=[avg_cost], - job_desc=JobDesc( - jobname, - num_trainer, - num_pserver, - cpu_per_trainer, - gpu_per_trainer, - mem_per_trainer, - cpu_per_pserver, - mem_per_pserver - )) -for data in train_reader(): - loss, acc = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost]) -``` - -`JobDesc` object describe the distributed job resource specification to run on -Cluster environment. - - - -`RemoteExecutor.run` sends the `ProgramDesc` and -[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource) -to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible -to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`. - - -### Placement Algorithm - -Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. - -In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. - - -### Local Training Architecture - -The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: - - - - -### Training Data - -In PaddlePaddle v0.10.0, training data is typically read -with [data reader](./README.md) from Python. This approach is -no longer efficient when training distributedly since the Python -process no longer runs on the same node with the trainer processes, -the Python reader will need to read from the distributed filesystem -(assuming it has the access) and send to the trainers, doubling the -network traffic. - -When doing distributed training, the user can still use Python data -reader: the training data are sent with `Executor.run`. However, should -be used for debugging purpose only. The users are encouraged to use -the read data OPs. - - -## References: - -[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) - -[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md deleted file mode 100644 index e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md +++ /dev/null @@ -1,89 +0,0 @@ -# Design Doc: Distributed Lookup Table Operator - -A distribute lookup table operator in PaddlePaddle where the table could be out -of the memory of a computer. - -## Background - -A lookup table operator is well-used in deep learning for learning the -representation, or the -[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of -symbols. - -### The Forward Algorithm - -The forward algorithm of the lookup table is a multiplication of the -input vector x and the lookup table matrix W: - -$$y = x * W$$ - -When x is a sparse vector of symbols, the above multiplication -simplifies into looking up rows in W that correspond to symbols in x, -denoted by W(x). Please be aware that W could be huge and out of the -memory, so we'd need a distributed storage service, which supports the -lookup of rows. - -The following figure illustrates the multiplication of x with two -non-zero elements, or say two symbols, and a lookup table W: - -![lookup table](./src/lookup_table.png) - -### The Backward Algorithm - -The backward algorithm computes W'(x) using W(x). W'(x) has the same -the scale of size as W(x) and is much smaller than W. - -To optimize W given W', we can do simple SGD update: - -$$W = f(W') = \lambda * W'$$ - -or some more sophisticated algorithms that rely on both W' and W: - -$$W = f(W, W')$$ - -The following figure illustrates the backward pass of the lookup -operator: ![lookup table training](./src/lookup_table_training.png) - -## Distributed Lookup Table -### Problem 1: The lookup table may be very large. - - In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is: - - ``` - 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB - ``` - -### Solution: Distributed storage - -1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter. - -1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer. - -### Problem 2. The Id in the lookup table is not sure before training. - - The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training. - -### Solution: Id auto growth - -At the beginning of training, paddle only malloc the memory for the lookup table at parameter server side, the Id and it's value will not be initialized. During training, when a parameter server received an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it. - -### Problem 3: parameter load and save - -For common parameters, paddle use trainer to save and load them. But for distributed lookup table, trainer cannot do this because it's large size. - -### Solution: Parameter server side save and load - -Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table. - -## Architecture -The whole architecture of the distribute lookup table is as below: - -### Training steps: -1. Read a batch of data, the data is feature ids. -1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table. -1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table. -1. Run forward-backward to get the gradient of the lookup table. -1. `split_ids_op` split the gradient and then use `send_op` to the parameter server. -1. parameter server update the table with the received gradient. - -![distribute lookup table](./src/distributed_lookup_table.jpeg) diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md deleted file mode 100644 index c09b7c99159ace9b3df989f803ede20bc3585d92..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/distributed_traing_review.md +++ /dev/null @@ -1,44 +0,0 @@ -# Parallelism, Asynchronous, Synchronous, Codistillation - - -For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discuss various solutions, their empirical results and some latest researches. - -# Model Parallelism -In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism address the issues by partitioning a single model and place the shards on several devices for execution. - -A common way of model parallelism is partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers. - -More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. Framework needs to provide flexible APIs for user to define the customized partition scheme. For example, in TensorFlow, user can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially solution space search algorithm that could cost a lot of extra hardware sources. - -# Data Parallelism -Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable. - -# Asynchronous Training -In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of shared parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested. - -In theory, asynchronous training is not safe and unstable. Each trainer is very likely using stale copy of parameters and parameters are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster. - -Many production models, such as [3], are trained with distributed asynchronous solutions due to its scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. Learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster. - -# Synchronous Training -Unlike asynchronous training, synchronous training requires step barriers. Parameter servers needs to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters. - -An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse). - -Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N is needed for the training step the proceed. - -Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t delivers faster converge time or better final model quality. - -# Codistillation -Codistillation is a technique that tries to scale the training further. A few training instance (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that comes from the prediction of other training instances. (likey teacher and student) The training process converges faster and usually converge to a better model quality. [4] - - -# Reference - -[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks. - -[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD. - -[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation. - -[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst deleted file mode 100644 index ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -分布式训练 ------------- - -.. toctree:: - :maxdepth: 1 - - distributed_architecture.md - distributed_lookup_table_design.md - parameter_server.md diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst deleted file mode 100644 index f84688f168021113bd933802709bcd787b474bca..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@ -Distributed Training ---------------------- - -.. toctree:: - :maxdepth: 1 - - distributed_architecture.md - distributed_lookup_table_design.md - parameter_server.md diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md deleted file mode 100644 index 4ad3afc7b7522c60460c6f1f387f9415d3738778..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/mpi_enabled_design.md +++ /dev/null @@ -1,46 +0,0 @@ -# MPI-enabled PaddlePaddle Design doc - -# Background -When we do distribute multi GPU training, the communication overhead between servers become the major bottleneck, because of the following reasons: -1. Must copy at least once from GPU to CPU memory so that the data can be ready to transfer. And for the pserver side, copy data from CPU to GPU introduce more overhead. -2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices. -3. TCP connections can not make full use of RDMA 100Gb devices. - -We will use OpenMPI API to PaddlePaddle, which can bring two benefits to PaddlePaddle: -1. Enable RDMA with PaddlePaddle, which bring high-performance low latency networks. -2. Enable GPUDriect with PaddlePaddle, which bring the highest throughput and lowest latency GPU read and write. - -# Change list -* Compile args: Need add compile args to enable MPI support. -* Execute args: Need add execute args to assign when and how to use MPI operations. -* New ops: Need new op ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive. -* Transpiler optimized: Which can add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph. -* MPI utils package: Need MPI utils package as the low-level API supported. - -## Compile args -Because MPI or CUDA need hardware supported, so we will add compile args to enable MPI support and control compiling.Add ```WITH_MPI``` compile args to control MPI to use or not. If the ```WITH_MPI``` is ```ON```, compile system will find openMPI codes in configuration. We should prepare openMPI environment before compiling. - -## Execute args -Launch the script using the ```mpirun``` launcher, For example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, We can number the actors (trainer/pserver/master) with o .. (n-1). The node's number is the Rank of the calling process in a group of comm (integer), The MPI processes identify each other using a Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank ID so that we can communicate with the correct destinations when using MPI operations. - -## New ops -We won't replace all the gRPC requests to MPI requests, the standard gRPC library is used for all administrative operations and the MPI API will be used to transfer tensor or selectRows to Pservers. The base of this idea, we create two new operators to handle requests and receives, the two operators are ```mpi_send_op``` and ```mpi_listenandserve_op```. They are a little similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc), also, We will build a new module to package MPI send and receive process. - -### mpi_send_op -Very similar with ```send_op```, we will replace gRPC code which used to send gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```. - -### mpi_listenandserve_op -Very similar with ```listen_and_serv_op```, we will replace gRPC code which used to receive gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```. - -## Transpiler optimized -**We can get env ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to distinguish use MPI or not, If we use openMPI, the variable in env must exist.** - if confirm to use MPI, we will modify ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and modify ```listenandserve_op``` to ```mpi_listenandserve_op``` also. - -## MPI utils package -In this package, We will write openMPI low-level API to use MPI. -The API included in this package are: -* MPI send and receive module, We will build a new module to package MPI send and receive process. MPI send and receive are different to gRPC, the MPI [recvice](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know receive buffer size and receive buffer element. For this reason, We have to make communications twice, the first one is to send metadata about gradient through gRPC, the second one is the real communication through MPI which send gradient data to mpi_listenandserve_op. -The detailed flow is below: -![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png) -* MPI global configurations, which store the Rank ID and the mapping in global variables, for example: -gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ``` diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md deleted file mode 100644 index 38222d083084ebfca3099ce96b47868c42d55101..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/multi_cpu.md +++ /dev/null @@ -1,43 +0,0 @@ -# Design Doc: Execute the Program with Multi CPU - -## Abstract - -This Design Doc propose an approach to make the user-defined Op graph -running with multi-CPU, we will use an auto transpiler to convert the user-defined -Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph. - -## Transpiler - - - -After converted: - - - -## Implement - -- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph - which would be executed with multi-threads. -- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait` - for the atomic counter become `0`: - ```cpp - BlockingCounter bc(thread_count); - for (int i = 0; i < thread_count; ++i) { - thread_pool->Start([&bc] {bc.DecrementCount(); }) - } - bc.Wait(); - ``` -- `ParallelDo` Operator - - Initialize a thread pool which is a Singleton. - - Use a block id as the input, and create run the specify Block on independent scope - with multi-threads. - - Initialize a `BlockingCounter` instance and wait until all threads are done. -- `Split` Operator will split the Input Tensor into a TensorArray. -- `Merge` merge all the gradients which calculated in different threads - with `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`. - -## TODO - -- Improve the optimizer stage with multi-threads, since we could - assign the parameters to the different threads and execute - optimizer with multi-threads. diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md deleted file mode 100644 index 563b70bc0e852bec953eb40dda3c46b3d45d7e68..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dist_train/parameter_server.md +++ /dev/null @@ -1,106 +0,0 @@ -# Design Doc: Parameter Server - -## Abstract - -We propose an approach to implement the parameter server. In this -approach, there is no fundamental difference between the trainer and -the parameter server: they both run subgraphs, but subgraphs of -different purposes. - -## Background - -The previous implementations of the parameter server do not run a -fluid sub-program. Parameter initialization, optimizer computation, network -communication and checkpointing are implemented twice on both the -trainer as well as the parameter server. - -It would be great if we can write code once and use them on both: the -trainer and the parameter server, since this reduces code duplication and -improves extensibility. Given that after the current refactoring, we are -representing everything as a computation graph on the -trainer. Representing everything as a computation graph on the parameter -server becomes a natural extension. - -## Design - -### Distributed Transpiler - -The *Distributed Transpiler* converts the user-defined fluid program -into sub-programs to be scheduled on different nodes with the following -steps: - -1. OP placement: the OPs will be placed on different nodes according - to a heuristic that minimizes the estimated total computation - time. Currently we will use a simple heuristic that puts parameter - variable on parameter server workers and everything else on trainer - workers. -1. Add communication OPs to enable the communication between nodes. - -We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. - -Below is an example of converting the user defined graph to the -subgraphs for the trainer and the parameter server: - - - -After converting: - - - -1. The parameter variable W and its optimizer program are placed on the parameter server. -1. Operators are added to the program. - - *Send* sends data to the connected *Recv* operator. The - scheduler on the receive node will only schedule *Recv* operator - to run when the *Send* operator has ran (the *Send* OP will mark - the *Recv* OP runnable automatically). - - *Enqueue* enqueues the input variable, it can block until space - become available in the queue. - - *Dequeue* outputs configurable numbers of tensors from the - queue. It will block until the queue has the required number of - tensors. - -### Sparse Update - -For embedding layers, the gradient may have many rows containing only 0 when training, -if the gradient uses a dense tensor to do parameter optimization, -it could spend unnecessary memory, slow down the calculations and waste -the bandwidth while doing distributed training. -In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing -non-zero gradient data. So when we do parameter optimization both locally and remotely, -we only need to send those non-zero rows to the optimizer operators: - - -### Benefits - -- Model parallelism becomes easier to implement: it is an extension to - the trainer - parameter server approach. We can have several "Transpilers" - to achieve different goals. -- User-defined optimizer is easier to add - user can now express it as - a sub-program. -- No more duplication logic inside the trainer and the parameter - server mentioned in the background section. - -### Challenges - -- It is important to balance the parameter shards on multiple - parameter servers. If a single parameter is very big (for example: some - word-embedding, fully connected, softmax layer), we need to - automatically partition the single parameter onto different - parameter servers when possible (only element-wise optimizer depends - on the parameter variable). -- In the "Async SGD" figure, the "W" variable on the parameter server - could be read and written concurrently. See - [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more - details about concurrent program in Fluid. - -### Discussion - -- Can the Enqueue OP be implemented under our current tensor design - (put the input tensor into the queue tensor)? -- *Dequeue* OP will have variable numbers of output (depending on the - `min_count` attribute), does our current design support it? (similar - question for the *Add* OP) - -### References - -[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png deleted file mode 100644 index 3b53ab59c0cd7b44b2956f16f1adc47fe85909d3..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/async_distributed_training.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle deleted file mode 100644 index d2301611774fcb3866473e3e6470568d1e1312cf..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/async_pserver.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png deleted file mode 100644 index 7d900b0c0eb291c67537b9cf93227c671bafdc73..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/async_pserver.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle deleted file mode 100644 index 3a631888688a0d564a873fcb16d943958c91223e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/async_update.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png deleted file mode 100644 index 3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/async_update.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle deleted file mode 100644 index 8cc678fea3c820103e7ce81f7a5d625d6c1d92de..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/compiler.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png deleted file mode 100644 index 65d34f841afce9756def07dd8ecb9ca44e658bfe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/compiler.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle deleted file mode 100644 index 941399c6ced8d5f65b6c595522b770c88259df4b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/dist-graph.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png deleted file mode 100644 index 3546b09f1c2ee3e4f60f519d5e47f823f08051a7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/dist-graph.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle deleted file mode 100644 index d1b60141342232e06227c2d430ebc60ec349a907..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/distributed_architecture.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png deleted file mode 100644 index 29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/distributed_architecture.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle deleted file mode 100644 index 65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg deleted file mode 100644 index 5353a16fd329f62ff893d32706b9c3c0bcc46a07..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle deleted file mode 100644 index 1168801bc1fadfce310a74cb3110695bd1629f6b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/distributed_training.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle deleted file mode 100644 index 96ca6d48f43bd9f49c6861dab006e2037873db87..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png deleted file mode 100644 index afa25ab3b4e427bc595a855b12ab966478e01ed0..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle deleted file mode 100644 index 19e509bd9af3c1e9a3f5e0f16ddd281457a339c5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/local-graph.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png deleted file mode 100644 index ada51200f793a9bb18911e7d63cfdb3244b967d7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/local-graph.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle deleted file mode 100644 index 49fcc663ebe3824aa234e3a67aadf285cb417877..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/local_architecture.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png deleted file mode 100644 index 14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/local_architecture.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png deleted file mode 100644 index 72dfe3547f731d0d090338afb206b0549dff472e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/lookup_table.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png deleted file mode 100644 index cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/lookup_table_training.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png deleted file mode 100644 index e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/mpi_module.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle deleted file mode 100644 index e71173715fff92a0a933d0c7d83599ba948552c6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/multi-threads.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png deleted file mode 100644 index e40a869987dbbf5019d4cb03c1dab55b74d6c9f9..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png deleted file mode 100644 index 4083aebfdd45af5fbac25fa2c4176bc08c3cb44a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle deleted file mode 100644 index 7d2753bbb03bc28c7a0054bb0aa424deb072ffbf..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/ncc2_design.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png deleted file mode 100644 index da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/ncc2_design.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle deleted file mode 100644 index a6348cc3dbcaca923c6e794681b2edb85cb9f8f6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/paddle-compile.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png deleted file mode 100644 index e0f13d551ac41afaec627a57dea79356464bf0bf..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/paddle-compile.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle deleted file mode 100644 index 41b2067311694b56d211a4f32d1b76884eeffd2d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/remote_executor.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png deleted file mode 100644 index 744e2fb2e0f1bbe058e991ba7b2a09000965ee79..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/remote_executor.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle deleted file mode 100644 index 08d689a58f83698d8c1158ee3990ed8abf3a7a9a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/sparse_update.graffle and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png deleted file mode 100644 index 8c872e6ac479f7d1b818a4a207956c43155d0ad7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/sparse_update.png and /dev/null differ diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png deleted file mode 100644 index e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dist_train/src/sync_distributed_training.png and /dev/null differ diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot deleted file mode 100644 index 5d77865061ca7bbbfcf254dd938f09aef5553505..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot +++ /dev/null @@ -1,56 +0,0 @@ -digraph G { - - rnn [label="1st level RNN" shape=box] - - subgraph cluster0 { - label = "time step 0" - - sent0 [label="sentence"] - sent1 [label="sentence"] - - rnn1 [label="2nd level RNN" shape=box] - - sent0 -> rnn1 - sent1 -> rnn1 - } - - subgraph cluster1 { - label = "time step 1" - - sent2 [label="sentence"] - sent3 [label="sentence"] - - rnn2 [label="2nd level RNN" shape=box] - - sent2 -> rnn2 - sent3 -> rnn2 - } - - subgraph cluster2 { - label = "time step 2" - - sent4 [label="sentence"] - sent5 [label="sentence"] - - rnn3 [label="2nd level RNN" shape=box] - - sent4 -> rnn3 - sent5 -> rnn3 - } - - - para0 [label="paragraph info 0"] - para1 [label="paragraph info 1"] - para2 [label="paragraph info 2"] - - rnn1 -> para0 - rnn2 -> para1 - rnn3 -> para2 - - para0 -> rnn - para1 -> rnn - para2 -> rnn - - chapter [label="chapter info"] - rnn -> chapter -} diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png deleted file mode 100644 index 0537a75beb175c0c284717421f7aa908da2a5038..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dynamic_rnn/2_level_rnn.png and /dev/null differ diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst deleted file mode 100644 index 1d224d22cf7103616f44115db01f0ae55f1cb88a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -动态RNN ------------- - -.. toctree:: - :maxdepth: 1 - - rnn.md - rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst deleted file mode 100644 index 568f496e4ffe21a5e730488aef905f7e2d98839e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/index_en.rst +++ /dev/null @@ -1,8 +0,0 @@ -Dynamic RNN ------------- - -.. toctree:: - :maxdepth: 1 - - rnn.md - rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot deleted file mode 100644 index c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/rnn.dot +++ /dev/null @@ -1,87 +0,0 @@ -digraph G { - label = "simple RNN implementation" - - ranksep=2; - - //graph [nodesep=1, ranksep=1]; - - node[nodesep=1] - - subgraph cluster0 { - label = "global scope" - rankdir = TB - W - boot_memory - input - output - } - - subgraph cluster1 { - label = "step-scope 0" - rankdir = TB - memory0[label="memory"] - prememory0[label="pre-memory"] - step_input0[label="step input"] - step_output0[label="step output"] - } - - subgraph cluster2 { - label = "step-scope 1" - rankdir = TB - memory1[label="memory"] - prememory1[label="pre-memory"] - step_input1[label="step input"] - step_output1[label="step output"] - } - - subgraph cluster3 { - label = "step-scope 2" - rankdir = TB - memory2[label="memory"] - prememory2[label="pre-memory"] - step_input2[label="step input"] - step_output2[label="step output"] - } - - stepnet [shape=box] - stepnet0 [shape=box, style=dashed] - stepnet1 [shape=box, style=dashed] - stepnet2 [shape=box, style=dashed] - - - edge[color=blue] - boot_memory -> prememory0 [label="init" color="blue"] - memory0 -> prememory1 [label="copy/reference" color="blue"] - memory1 -> prememory2 [label="copy/reference" color="blue"] - - edge[color=black] - W -> stepnet0[constraint=false, style=dashed] - W -> stepnet1[constraint=false, style=dashed] - W -> stepnet2[constraint=false, style=dashed] - - memory0 -> stepnet0[style=dashed] - prememory0 -> stepnet0 -> step_output0[style=dashed] - - memory1 -> stepnet1[style=dashed] - prememory1 -> stepnet1 -> step_output1[style=dashed] - - memory2 -> stepnet2[style=dashed] - prememory2 -> stepnet2 -> step_output2[style=dashed] - - input -> step_input0 - input -> step_input1 - input -> step_input2 - - step_input0 -> stepnet0 [style=dashed] - step_input1 -> stepnet1[style=dashed] - step_input2 -> stepnet2[style=dashed] - - step_output0 -> output - step_output1 -> output - step_output2 -> output - - stepnet0 -> stepnet[style=dashed] - stepnet1 -> stepnet[style=dashed] - stepnet2 -> stepnet[style=dashed] - -} diff --git a/doc/fluid/design/dynamic_rnn/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg deleted file mode 100644 index 9867e404cf959df0dce6ded5222b466c788fb840..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dynamic_rnn/rnn.jpg and /dev/null differ diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md deleted file mode 100644 index b39ae0675c45e56852293d97f45e91861cf31667..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/rnn.md +++ /dev/null @@ -1,153 +0,0 @@ -# RNNOp design - -This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. - -## RNN Algorithm Implementation - -

- -

- -The above diagram shows an RNN unrolled into a full network. - -There are several important concepts here: - -- *step-net*: the sub-graph that runs at each step. -- *memory*, $h_t$, the state of the current step. -- *ex-memory*, $h_{t-1}$, the state of the previous step. -- *initial memory value*, the memory of the first (initial) step. - -### Step-scope - -There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. - -

-
-Figure 2 illustrates the RNN's data flow -

- -Please be aware that every step runs the same step-net. Each step does the following: - -1. Creates the step-scope. -2. Initializes the local variables including step-outputs, in the step-scope. -3. Runs the step-net, which uses the above mentioned variables. - -The RNN operator will compose its output from step outputs in each of the step scopes. - -### Memory and Ex-memory - -Let's give more details about memory and ex-memory using a simple example: - -$$ -h_t = U h_{t-1} + W x_t -$$, - -where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. - -In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, -or copy the memory value of the previous step to the current ex-memory variable. - -### Usage in Python - -For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md). - -We can define an RNN's step-net using a Block: - -```python -import paddle as pd - -X = some_op() # x is some operator's output and is a LoDTensor -a = some_op() - -# declare parameters -W = pd.Variable(shape=[20, 30]) -U = pd.Variable(shape=[20, 30]) - -rnn = pd.create_rnn_op(output_num=1) -with rnn.stepnet(): - x = rnn.add_input(X) - # declare a memory (rnn's step) - h = rnn.add_memory(init=a) - # h.pre_state(), the previous memory of rnn - new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) - # update current memory - h.update(new_state) - # indicate that h variables in all step scopes should be merged - rnn.add_outputs(h) - -out = rnn() -``` - -Python API functions in above example: - -- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. -- `rnn.add_memory`: creates a variable used as the memory. -- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. - -### Nested RNN and LoDTensor - -An RNN whose step-net includes other RNN operators is known as an *nested RNN*. - -For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. - -The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. - -

- -

- -```python -import paddle as pd - -W = pd.Variable(shape=[20, 30]) -U = pd.Variable(shape=[20, 30]) - -W0 = pd.Variable(shape=[20, 30]) -U0 = pd.Variable(shape=[20, 30]) - -# a is output of some op -a = some_op() - -# chapter_data is a set of 128-dim word vectors -# the first level of LoD is sentence -# the second level of LoD is a chapter -chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) - -def lower_level_rnn(paragraph): - ''' - x: the input - ''' - rnn = pd.create_rnn_op(output_num=1) - with rnn.stepnet(): - sentence = rnn.add_input(paragraph, level=0) - h = rnn.add_memory(shape=[20, 30]) - h.update( - pd.matmul(W, sentence) + pd.matmul(U, h.pre_state())) - # get the last state as sentence's info - rnn.add_outputs(h) - return rnn - -top_level_rnn = pd.create_rnn_op(output_num=1) -with top_level_rnn.stepnet(): - paragraph_data = rnn.add_input(chapter_data, level=1) - low_rnn = lower_level_rnn(paragraph_data) - paragraph_out = low_rnn() - - h = rnn.add_memory(init=a) - h.update( - pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) - top_level_rnn.add_outputs(h) - -# output the last step -chapter_out = top_level_rnn(output_all_steps=False) -``` - -In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. - -By default, the `RNNOp` will concatenate the outputs from all the time steps. -If the `output_all_steps` is set to False, it will only output the final time step. - - -

- -

diff --git a/doc/fluid/design/dynamic_rnn/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png deleted file mode 100644 index e139e373fe8396782044cfd936fdde624f8c66fe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dynamic_rnn/rnn.png and /dev/null differ diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot deleted file mode 100644 index 1d85ae2617a915ad0ad8288d848b607cc37ad297..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot +++ /dev/null @@ -1,75 +0,0 @@ -digraph G { - chapter [label="chapter"] - - subgraph cluster0 { - label = "paragraph 0" - - top_rnn0[label="top rnn step 0" shape=box] - - p0 [label="paragraph 0"] - p1 [label="paragraph 1"] - } - - subgraph cluster1{ - label = "paragraph 1" - - top_rnn1[label="top rnn step 1" shape=box] - - p2 [label="paragraph 0"] - p3 [label="paragraph 1"] - } - - subgraph cluster_p0 { - label = "sentence 0" - - low_rnn0 [label="low rnn step 0" shape=box] - s00 [label="sentence 0"] - s01 [label="sentence 1"] - - low_rnn0 -> s00 - low_rnn0 -> s01 - } - - subgraph cluster_p1 { - label = "sentence 1" - low_rnn1 [label="low rnn step 1" shape=box] - s10 [label="sentence 0"] - s11 [label="sentence 1"] - low_rnn1 -> s10 - low_rnn1 -> s11 - } - - subgraph cluster_p2 { - label = "sentence 1" - low_rnn2 [label="low rnn step 0" shape=box] - s20 [label="sentence 0"] - s21 [label="sentence 1"] - low_rnn2 -> s20 - low_rnn2 -> s21 - } - - subgraph cluster_p3 { - label = "sentence 1" - low_rnn3 [label="low rnn step 1" shape=box] - s30 [label="sentence 0"] - s31 [label="sentence 1"] - low_rnn3 -> s30 - low_rnn3 -> s31 - } - - - chapter -> top_rnn0 - chapter -> top_rnn1 - - top_rnn0 -> p0 - top_rnn0 -> p1 - top_rnn1 -> p2 - top_rnn1 -> p3 - - - p0 -> low_rnn0 - p1 -> low_rnn1 - p2 -> low_rnn2 - p3 -> low_rnn3 - -} diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png deleted file mode 100644 index 4be81b2430717a6a506342a09fc26899568574c6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png and /dev/null differ diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md deleted file mode 100644 index cecfcd3307ae4c4fa603220a360e9e124069fa58..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/rnn_design.md +++ /dev/null @@ -1,242 +0,0 @@ -# RNN 变长输入设计 -对变长序列的学习,现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式, -即将一个mini-batch内不同长度的序列补0到固定长度参与计算。 - -现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持,本文也将基于该模块的思路,设计重构后的变长序列支持。 - -## 背景介绍 -由于tensor必须有明确的shape,因此基于tensor 的主流框架在存储变长序列时, -必须用zero-padding的方式将变长序列补全为固定shape的tensor。 - -由于padding是一种框架实现变长序列的妥协, 从用户角度,在使用RNN类模型时自然会比较介意padding的存在, -因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。 - -由于padding对内存和计算会有额外的消耗,tensorflow和mxnet均使用了bucketing来进行优化[1][2], -但不管是padding还是bucket,对于用户都是额外的使用负担。 - -因此,**paddle原生支持变长序列的方式,能直接满足用户对变长序列的最直接的需求,在当前主流平台中可以算是一大优势**。 - -但对变长序列的支持,需要对目前框架做一些修改,下面讨论如何在最小修改下支持变长序列。 - -## 多层序列数据格式 `LODTensor` -目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上, -额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。 - -Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息,更高维度的序列则无法直接支持; - -为了支持 `N-level` 序列的存储,本文将序列信息定义成如下数据结构: - -```c++ -std::shared_ptr>> lod_start_pos_; -``` - -或者更明确的定义 - -```c++ -typedef std::vector level_t; -std::vector lod_start_pos; -``` - -这里的每一个 `level_t` 存储一个粒度(level)的偏移信息,和paddle目前做法一致。 - -为了更透明地传递序列信息,我们引入了一种新的tensor 称为 `LODTensor`[4], -其关于tensor相关的接口都直接继承自 `Tensor`,但另外添加了序列相关接口。 -如此,在操作一个 `LODTensor` 时,普通 `Op` 直接当成 `Tensor` 使用, -而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。 - -`LODTensor` 具体定义如下: - -```c++ -class LODTensor : public Tensor { -public: - size_t Levels() const { return seq_start_positions_.size(); } - size_t Elements(int level = 0) const { - return seq_start_positions_[level].size(); - } - // slice of level[elem_begin: elem_end] - // NOTE low performance in slice seq_start_positions_. - // TODO should call Tensor's Slice. - LODTensor LODSlice(int level, int elem_begin, int elem_end) const; - - // slice with tensor's data shared with this. - LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const; - - // copy other's lod_start_pos_, to share LOD info. - // NOTE the LOD info sould not be changed. - void ShareConstLODFrom(const LODTensor &other) { - lod_start_pos_ = other.lod_start_pos_; - } - // copy other's lod_start_pos_'s content, free to mutate. - void ShareMutableLODFrom(const LODTensor &other) { - lod_start_pos_ = std::make_shared < - std::vector>(other.lod_start_pos_.begin(), - other.lod_start_pos_.end()); - } - -private: - std::shared_ptr>> lod_start_pos_; -}; -``` - -其中, `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价, -可以认为 `LODTensor` 是 `Tensor` 的扩展,几乎完全兼容原始 `Tensor` 的使用。 - -## 框架支持 -### 框架现有的 `Tensor` 调用替换为 `LODTensor` -为了实现 `LODTensor` 的传递,框架里很多 `Tensor` 都需要变成 `LODTensor`, -简单实现,直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`,这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。 - -此外,用户有可能需要感知序列的存在(比如序列的可视化需要解析模型中输出的序列),因此一些序列操作的API也需要暴露到 python 层。 - -### `lod_start_pos` 随着Op调用链传递 -框架需要支持下列特性,以实现`lod_start_pos`的传递: - -1. 以 `shared_ptr` 的方式实现传递 - - 不修改 `lod_start_pos` 内容的作为 consumer - - 修改 `lod_start_pos` 的作为 producer - - 约定 consumer 只需要复制传递过来的 `shared_ptr` - - producer 需要创建自己的独立的内存,以存储自己独立的修改,并暴露 `shared_ptr` 给后续 consumer - - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos` - -2. 对于不感知 `lod_start_pos` 的Op足够透明 -3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 - -具体的设计分为以下3小节 - -#### `load_start_pos` 的传递 - -- 对于不需要修改 `lod_start_pos` 的情况,调用 LODTensor的 `ShareConstLODFrom` 接口实现复制 -- 需要修改的,调用`ShareMutableLODFrom` 接口自己分配内存以存储修改 - -#### 框架透明 -传递这一步需要加入到网络跑之前的初始化操作中,并且只需要初始化一次,基于当前框架设计的初步方案如下 - -- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性,默认为 `false` - - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true` -- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ,并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。 -- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次 - -一些逻辑如下 - -```c++ -class OperatorBase { -public: - // ... - void InferShape() { - if (!is_load_inited) { - bool do_mutate_lod_info = GetAttr("do_mutate_load_info"); - // find a input having LOD to copy - auto lod_input = ValidLODInput(); - for (auto &output : outputs) { - if (do_mutate_load_info) { - output.ShareMutableLODFrom(lod_input); - } else { - output.ShareConstLODFrom(load_input); - } - } - is_pod_inited = true; - } - - // call op's InferShape - // ... - } - -private: - // ... - bool is_lod_inited{false}; -}; -``` - -如此,`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。 - -#### `lod_start_pos` 的更新 -上一小节介绍到,对于需要修改 `load_start_pos` 的Op,`OperatorBase` 会分配一块自己的内存以存储修改, -Op在 `Run` 的实现中,操作更新自己的 `load_start_pos` , -而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。 - -## 根据长度排序 -按照长度排序后,从前往后的时间步的batch size会自然地递减,可以直接塞入 Net 做batch计算 - -比如原始的输入: - -``` -origin: -xxxx -xx -xxx - --> sorted: -xxxx -xxx -xx -``` - -经过 `SegmentInputs` 之后,每个会有4个时间步,每个时间步的输入如下(纵向排列) - -``` -0 1 2 3 -x x x x -x x x -x x -``` - -为了追踪排序前后序列的变化,这里用 -```c++ -struct SortedSeqItem { - void *start{nullptr}; - void *end{nullptr}; -}; - -std::vector sorted_seqs; -``` -来追踪序列排序后的位置,并添加一个新的接口 - -```c++ -std::vector SortBySeqLen(const LODTensor& tensor); -``` - -由于输入序列的顺序变化,以下现有的接口需要针对性地修改: - -- InitMemories, memory需要根据 `sorted_seqs` 重新排列 -- SetmentInputs -- ConcatOutputs - -此外,由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用,因此会变成 `RecurrentOp` 一个新的output输出, -之后作为 `RecurrentGradientOp` 的一个输入传入。 - -## InitMemories -由于序列顺序的变化,`boot_memories` 的batch上的element的顺序也需要对应重新排列。 - -## SegmentInputs -`SegmentInputs` 会依赖 `sorted_seqs` 的信息,将原始的序列按照排序后的序列顺序,从横向切割,转为每个step中的inputs。 - -即下面的转变: -``` -origin: -xxxx -xx -xxx - - | - | - \ / - ! -0 1 2 3 -x x x x -x x x -x x -``` -## ConcatOutputs -`ConcatOutputs` 需要 - -- 将每个时间步的输出重新还原为原始输入的序列顺序(以防止Infer阶段顺序打乱) -- 将每个序列concat 为规则的mini-batch表示 - -## 参考文献 -[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) - -[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html) - -[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) - -[Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md deleted file mode 100644 index 9493908f4f73b3e7d91f5f6364a2a3660257d508..0000000000000000000000000000000000000000 --- a/doc/fluid/design/dynamic_rnn/rnn_design_en.md +++ /dev/null @@ -1,175 +0,0 @@ -# Varient Length supported RNN Design -For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding. - -Different-length sequences in a mini-batch will be padded with zeros and transformed to same length. - -The existing RNN implementations of the PaddlePaddle is `RecurrentLayerGroup`, -which supports the variable length sequences without padding. -This doc will design fluid's RNN based on this idea. - -## Multi-layer sequence data format `LODTensor` -At present, Paddle stores data in one mini-batch in one-dimensional array. - -`Argument.sequenceStartPositions` is used to store information for each sentence. - -In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher dimensional sequences can not be supported. - -In order to support the storage of `N-level` sequences, we define sequence information as the following data structure. - - -```c++ -std::shared_ptr>> lod_start_pos_; -``` - -Or more clearly defined here - -```c++ -typedef std::vector level_t; -std::vector lod_start_pos; -``` -Each `level_t` here stores a level of offset information consistent with paddle's current practice. - -In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1]. -Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds serial-related interfaces. -Thus, when working with a `LODTensor`, ordinary `Op` is used directly as `Tensor`. -The `Op` of the operation sequence will additionally operate the relevant interface of the `LODTensor` variable-length sequence operation. - -The definition of `LODTensor` is as follows: - - -```c++ -class LODTensor : public Tensor { -public: - size_t Levels() const { return seq_start_positions_.size(); } - size_t Elements(int level = 0) const { - return seq_start_positions_[level].size(); - } - // slice of level[elem_begin: elem_end] - // NOTE low performance in slice seq_start_positions_. - // TODO should call Tensor's Slice. - LODTensor LODSlice(int level, int elem_begin, int elem_end) const; - - // slice with tensor's data shared with this. - LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const; - - // copy other's lod_start_pos_, to share LOD info. - // NOTE the LOD info sould not be changed. - void ShareConstLODFrom(const LODTensor &other) { - lod_start_pos_ = other.lod_start_pos_; - } - // copy other's lod_start_pos_'s content, free to mutate. - void ShareMutableLODFrom(const LODTensor &other) { - lod_start_pos_ = std::make_shared < - std::vector>(other.lod_start_pos_.begin(), - other.lod_start_pos_.end()); - } - -private: - std::shared_ptr>> lod_start_pos_; -}; -``` -Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication. -`LODTensor` can be thought as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`. - -## How to support the framework -### Replace `Tensor` with `LoDTensor` -To implement the passing of `LODTensor`, most `Tensor` in the framework need to be replaced with `LODTensor`. -Simple implementation, directly **replace all previous `Tensor` with `LODTensor`** , where you can directly modify the `Tensor` interface created in `pybind.cc`. - -In addition, the user may need to perceive the existence of a sequence (such as the sequence of the visualization needs to parse the output sequence in the model), so some of the serial operation APIs also need to be exposed to the python layer. - -### Transmit `lod_start_pos` along with the Op call chain -`lod_start_pos` is passed along with the Op call chain -The framework needs to support the following features to implement the transmit of `lod_start_pos`: - -1. Implement the transfer as `shared_ptr` - - Do not modify the contents of `lod_start_pos` as a consumer - - Modify producer of `lod_start_pos` as producer - - Conventions consumer only needs to copy `shared_ptr` passed over - - producer needs to create its own independent memory to store its own independent modifications and expose `shared_ptr` to subsequent consumer - - Since the transfer process is implemented by copying `shared_ptr`, the framework only needs to pass `lod_start_pos` once. - -2. Op is transparent enough not to sense `lod_start_pos` -3. Producer Op that needs to modify `lod_start_pos` can update its `lod_start_pos` data when `Run` - -## sorted by length -After sorting by length, the batch size from the forward time step will naturally decrement, and you can directly plug it into Net to do the batch calculation. - -For example, the original input: - -``` -origin: -xxxx -xx -xxx - --> sorted: -xxxx -xxx -xx -``` - -After `SegmentInputs`, there will be 4 time steps, the input of each time step is as follows (vertical arrangement) - -``` -0 1 2 3 -x x x x -x x x -x x -``` - -In order to track the changes before and after sorting, use here - -```c++ -struct SortedSeqItem { - void *start{nullptr}; - void *end{nullptr}; -}; - -std::vector sorted_seqs; -``` -To track the position of the sequence after sorting, and add a new interface - -```c++ -std::vector SortBySeqLen(const LODTensor& tensor); -``` -Due to the sequence of input sequences, the following existing interfaces need to be modified: - -- InitMemories, memory needs to be rearranged according to `sorted_seqs` -- SetmentInputs -- ConcatOutputs - -In addition, because `sorted_seqs` needs to be multiplexed with `RecurrentGradientOp`, it will become a new output of `RecurrentOp`. -It is passed in as an input to `RecurrentGradientOp`. - -## InitMemories -Due to the sequence change, the order of the elements on the `boot_memories` batch also needs to be rearranged accordingly. - -## SegmentInputs - -`SegmentInputs` relies on the information of `sorted_seqs` to cut the original sequence from the horizontal to the input of each step in the sorted sequence order. - -the transition is as follows: -``` -origin: -xxxx -xx -xxx - - | - | - \ / - ! -0 1 2 3 -x x x x -x x x -x x -``` -## ConcatOutputs -`ConcatOutputs` needs - -- Restore the output of each time step back to the original input sequence order (to prevent the order of Infer phase from being upset) -- Concat each sequence as a regular mini-batch representation - -## references -1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/doc/fluid/design/execution/if_else_op.md b/doc/fluid/design/execution/if_else_op.md deleted file mode 100644 index 26d140f06db4ecefa86be015eaa731ffddc6910c..0000000000000000000000000000000000000000 --- a/doc/fluid/design/execution/if_else_op.md +++ /dev/null @@ -1,51 +0,0 @@ -# The `IfElse` Operator - -PaddlePaddle's `IfElse` operator differs from TensorFlow's: - -- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas -- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. - -## Example - -The following PaddlePaddle program shows the usage of the IfElse operator: - -```python -import paddle as pd - -x = minibatch([10, 20, 30]) # shape=[None, 1] -y = var(1) # shape=[1], value=1 -z = minibatch([10, 20, 30]) # shape=[None, 1] -cond = larger_than(x, 15) # [false, true, true] - -ie = pd.ifelse() -with ie.true_block(): - d = pd.layer.add(x, y) - ie.output(d, pd.layer.softmax(d)) -with ie.false_block(): - d = pd.layer.fc(z) - ie.output(d, d+1) -o1, o2 = ie(cond) -``` - -A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. - -An equivalent C++ program is as follows: - -```c++ -namespace pd = paddle; - -int x = 10; -int y = 1; -int z = 10; -bool cond = false; -int o1, o2; -if (cond) { - int d = x + y; - o1 = z; - o2 = pd::layer::softmax(z); -} else { - int d = pd::layer::fc(z); - o1 = d; - o2 = d+1; -} -``` diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst deleted file mode 100644 index ed31b017429d168b2466d8f6b423f48bd5d78d1f..0000000000000000000000000000000000000000 --- a/doc/fluid/design/execution/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -执行流程 -------------- - -.. toctree:: - :maxdepth: 1 - - switch.md - if_else_op.md diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst deleted file mode 100644 index fcf846da348ff0bed707c42718e08314998fbac0..0000000000000000000000000000000000000000 --- a/doc/fluid/design/execution/index_en.rst +++ /dev/null @@ -1,8 +0,0 @@ -Execution Process --------------------------------------- - -.. toctree:: - :maxdepth: 1 - - switch.md - if_else_op.md diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md deleted file mode 100644 index 1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8..0000000000000000000000000000000000000000 --- a/doc/fluid/design/execution/switch.md +++ /dev/null @@ -1,31 +0,0 @@ -# Design Doc: Switch - -## Background - -Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. - -The following example shows the usage of `fluid.switch`. - -```python -a = fluid.Var(10) -b = fluid.Var(0) - -with switch() as switch: - with switch.case(fluid.less_equal(a, 10)): - fluid.print("Case 1") - with switch.case(fluid.larger(a, 0)): - fluid.print("Case 2") - with switch.default(): - fluid.print("Case 3") -``` - -## The Semantics - -1. A `switch` control-flow checks cases one-by-one. -1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. -1. It runs the first matched case, or the default case if there is one. -1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case. - -The above program should print and print only "Case 1". - -The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches. diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst deleted file mode 100644 index 31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914..0000000000000000000000000000000000000000 --- a/doc/fluid/design/index_cn.rst +++ /dev/null @@ -1,19 +0,0 @@ -设计思想 ------------- - -.. toctree:: - :maxdepth: 1 - - motivation/index_cn.rst - execution/index_cn.rst - concepts/index_cn.rst - data_type/index_cn.rst - memory/index_cn.rst - multi_devices/index_cn.rst - dynamic_rnn/index_cn.rst - concurrent/index_cn.rst - algorithm/index_cn.rst - network/index_cn.rst - modules/index_cn.rst - interface/index_cn.rst - dist_train/index_cn.rst diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst deleted file mode 100644 index 2bfee02ad4626633b08ddff747e2886faf9ba99f..0000000000000000000000000000000000000000 --- a/doc/fluid/design/index_en.rst +++ /dev/null @@ -1,19 +0,0 @@ -Design ------------- - -.. toctree:: - :maxdepth: 1 - - motivation/index_en.rst - execution/index_en.rst - concepts/index_en.rst - data_type/index_en.rst - memory/index_en.rst - multi_devices/index_en.rst - dynamic_rnn/index_en.rst - concurrent/index_en.rst - algorithm/index_en.rst - network/index_en.rst - modules/index_en.rst - interface/index_en.rst - dist_train/index_en.rst diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst deleted file mode 100644 index 69a8d9bad4fe88935b9fa87757abf0105ca8eb75..0000000000000000000000000000000000000000 --- a/doc/fluid/design/interface/index_cn.rst +++ /dev/null @@ -1,4 +0,0 @@ -多语言接口 ------------- - -TBD diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst deleted file mode 100644 index 22abc71f984aa5da7151d5ebf0c3bdbcc69a3624..0000000000000000000000000000000000000000 --- a/doc/fluid/design/interface/index_en.rst +++ /dev/null @@ -1,4 +0,0 @@ -Multi-Language Interface ------------------------ - -TBD diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md deleted file mode 100644 index 83ef97c99efeaf27a27f93f0cd3857c0f1bc812e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/ir/overview.md +++ /dev/null @@ -1,185 +0,0 @@ -## Motivation - -There is a `gap` between the `Program` defined by -user and the `Executable` that can be scheduled -efficiently on heterogeneous hardware, either locally -or distributedly. - -Usually, the `gap` is bridged by - -* A serious transformations with defined order. - -* These transformations usually involve -`insert, delete, clustering, split, dependency analysis`. - -* Has a simple way to verify and debug each transformation. - -* Flexible to add, remove or customize transformations to fit -the requirements of various algorithms (models) and hardware secenarios. - -Some other events also push us to a better unified pattern. - -* The deep learning framework is built around the concepts of graphs. -To leverage tools such as compilation (e.g. TVM and nGraph) or -cross-framework conversion (e.g. ONNX), we also need a intermediate -representation that can be connected to the rest of the ecosystem. - - -We need a unified pattern to naturally support the requirements -described above. The pattern should fit both training, inference -and other offline serielized model transformations. -Learned from LLVM and other deep learning framework, we draft the -design below. - - -## Design - -### Major Concepts - -#### Node - -`Node` represents an operation that performs some computation or -a variable that is input or output of operation. - -`Node`s are connected to other `Node`s via inputs and outputs. - -Other properties (maybe device placement information) can be added -to `Node` in the future if it's a -common requirement of many other `Pass`es. Otherwise, it should live -in a `Node` wrapper class that is private to some `Pass` or be -a local member of a `Pass`. - -#### Graph - -`Graph` contains a list of `Node`s, which are connected to -each other via inputs and outputs. - -TODO: Better definitions for the graph. - -`Graph` can also contain `Attribute`s. `Attribute`s -can be `any` thing. For example, it can be a list of "wraper" -nodes. The `wrapper` nodes compose `Node`s and provide -helper method for execution or transformation. `Attribute` -can also contain other things that describe some properties of -the `Graph` or `Graph` nodes. `Attribute` can be passed -across `Pass`. However, it should be used with care. - -```cpp -class Graph { - public: - explicit Graph(const ProgramDesc &program); - - bool Has(const std::string &attr_name) const; - - template - AttrType &Get(const std::string &attr_name) const; - - template - void Set(const std::string &attr_name, AttrType *attr); - const std::unordered_set &Nodes() const; - - // Create a normal variable with non-null VarDesc. - ir::Node *CreateVarNode(VarDesc *var_desc); - - // Create a normal runnable operator with OpDesc. - ir::Node *CreateOpNode(OpDesc *op_desc); - - // Create a control dependency var that connects 2 operations. The - // var doesn't hold any data. Other than that, it's no different from - // other var, considering dependency analysis. - ir::Node *CreateControlDepVar(); - - // A more free style way of creating a graph node. Mostly use for test - // or "copy" from another node. Avoid using it if possible. - ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); - - // Clear all node information of the graph and return the ownership of the - // nodes. - std::vector> ReleaseNodes(); -}; -``` - -#### Pass - -`Pass` represents a transformation of `Graph`. Its input -is a `Graph` and its output is also a `Graph`. For example, -a `Pass` can simply print out the `Graph`. A `Pass` -can also fuse some `Graph`'s `Node`s. - -```cpp -class Pass { - public: - - std::unique_ptr Apply(std::unique_ptr graph) const { - // Some correctness check. - auto new_graph = ApplyImpl(std::move(graph)); - // Some correctness check. - return new_graph; - } - - // Get a reference to the attributed previously set. - template - AttrType &Get(const std::string &attr_name) const; - - // Set a pointer to the attribute. Pass takes ownership of the attribute. - template - void Set(const std::string &attr_name, AttrType *attr) ; - - // Set a pointer to the attribute. Pass doesn't take ownership. Caller - // should delete the attribute. - template - void SetNotOwned(const std::string &attr_name, AttrType *attr); - - protected: - virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0; -}; - -// In my_pass.cc -class MyPass : public Pass { - protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const override { - // do something. - return graph; - } -} -REGISTER_PASS(my_pass, MyPass) -.RequirePassAttr("places") -.RequireGraphAttr("dep_vars"); - - -// To use the pass. -auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); -graph = my_pass->Apply(std::move(graph)); -// Note: to force link my_pass.cc, in the code: -USE_PASS(my_pass); -``` - -#### Optimize - -`Optimize` contains a series of `Pass` with defined order. -`Optimize` transforms a `Graph` that only contains raw -modeling logic to a `Graph` that can be run efficiently while -maintaining the original modeling logic. - - -### Optimize Process - -* Program is first converted to Graph. -* Graph goes through a series of Pass -* Graph is transformed from raw model logic to a -form that is efficient to execute. - -``` -// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor -auto graph = Graph(program); -graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah)); -// For more complex Pass, Optimize Process can provide Pass attributes. -auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass"); -mem_opt_pass.SetNotOwned("optimize_level", 1); -mem_opt_pass->Apply(std::move(graph)); -graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah)); -graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah)); -Executor exe; -exe.Run(graph); - -``` diff --git a/doc/fluid/design/memory/README.md b/doc/fluid/design/memory/README.md deleted file mode 100644 index 7cf61d089b39041b7a15184e0ea9211d14a66f5e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/memory/README.md +++ /dev/null @@ -1,141 +0,0 @@ -# Region-based Heterogeneous Memory Management -## Design - -### Usage - -To allocate 4KB CPU memory: - -```cpp -p = memory::Alloc(platform::CPUPlace(), 4*1024); -``` - -To allocate 4KB memory on the 3rd GPU: - -```cpp -p = memory::Alloc(platform::CUDAPlace(2), 4*1024); -``` - -To free memory and check the so-far used amount of memory on a place: - -```cpp -auto pl = platform::CUDAPlace(0); -p = memory::Alloc(pl, 4*1024); -cout << memory::Used(pl); -memory::Free(pl, p); -``` - -### API - -In `paddle/memory/memory.h` we have: - -```cpp -namespace memory { -template void* Alloc(Place, size_t); -template void Free(Place, void*); -template size_t Used(Place); -} // namespace memory -``` - -These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`: - -```cpp -template<> -void* Alloc(CPUPlace p, size_t size) { - return GetCPUBuddyAllocator()->Alloc(size); -} -``` - -and - -```cpp -template<> -void Alloc(CUDAPlace p, size_t size) { - return GetGPUBuddyAllocator(p.id)->Alloc(size); -} -``` - -Similar specializations exist for `Free` and `Used`. - -### Implementation - -`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions. - -```cpp -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = NULL; - if (a == NULL) { - a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...); - } - return a; -} - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator* as = NULL; - if (as == NULL) { - as = new BuddyAllocator*[platform::NumGPUs()]; - for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) { - as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...); - } - } - return as[gpu_id); -``` - -#### `BuddyAllocator` - -`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm: - -```cpp -BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) { - ... -} -``` - -Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object: - -```cpp -class BuddyAllocator { - private: - struct Block { - size_t size; - Block* left, right; - size_t index; // allocator id - }; - ... -}; -``` - -Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`. Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace. - -#### System Allocators - -The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. - -## Justification - -I got inspiration from Majel and Caffe2, though above design look different from both. - -### Caffe2 - -In Caffe2, `Tensor::mutable_data()` allocates the memroy. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479). - -There are two implementations of `Context`: - -1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. - -1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. - -### Majel - -In Majel, there are basically two allocator types: - -1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`. -1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`. - -However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces. - -In Majel there are hidden global variables like: - -1. `cpu::SystemAllocator g_cpu_allocator`, and -1. `vector g_gpu_allocators(NUM_GPUS)`. - -Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`. diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png deleted file mode 100644 index 3579998e58d07abc50bd3332128d4733a391cb3b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/memory/images/control_flow_graph.png and /dev/null differ diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png deleted file mode 100644 index c10f7f69f4007952e5b0394edaa04efa1cfbb658..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/memory/images/dataflow_equations.png and /dev/null differ diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png deleted file mode 100644 index 026becc4d94e01e407dacb2a5314a0e5723334ff..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/memory/images/deep_learning.png and /dev/null differ diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst deleted file mode 100644 index c507c638bd1a6eb428175ed2756a6ecfc6cca198..0000000000000000000000000000000000000000 --- a/doc/fluid/design/memory/index_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -内存管理 ------------- - -.. toctree:: - :maxdepth: 1 - - memory_optimization.md diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst deleted file mode 100644 index f7526437a73a09b300f05e138084755f5528b242..0000000000000000000000000000000000000000 --- a/doc/fluid/design/memory/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -Memory Management -------------------- - -.. toctree:: - :maxdepth: 1 - - memory_optimization.md diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md deleted file mode 100644 index 285464ada728d8f7a086a26beca6cfa4418e98e4..0000000000000000000000000000000000000000 --- a/doc/fluid/design/memory/memory_optimization.md +++ /dev/null @@ -1,217 +0,0 @@ -# Memory Optimization - - -## Problem - -In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these: - -- Availability of Big Data -- Supercomputing power to process this Big Data over very large neural networks -- Modern algorithms - -Following graph shows the details: - -![](images/deep_learning.png) - -Larger model usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference. - -## Solution - -### Basic Strategy - -There are some basic strategies to improve memory usage, including in-place operations and memory sharing. - -#### In-place Operation -In a relu activation operator: - -$y = \max(x, 0)$ - -If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately. - -#### Memory Sharing - -Not all operators support in-place operations. Memory sharing is a more general strategy. - -Following is an example: - -``` -a = op1(b, c); -d = op2(a) -e = op3(d, f) -``` - -In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool. - - -### Live Variable Analysis - -It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation. - -In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. - -In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess tempory variables can be kept in memory. - -Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. - -We can leran these techniques from compilers. There are mainly two stages to make live variable analysis: - -- construct a control flow graph -- solve the dataflow equations - - -#### Control Flow Graph -To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y. - -Following is the flow graph for a simple loop. - -![](images/control_flow_graph.png) - -#### Dataflow Analysis - -Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program. - -A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes. - -- Flow Graph Terminology - -A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors. -In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}. - -- Uses and Defs - -An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}. - -- Liveness - -A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node. - - -The calcution of liveness can be solved by iteration until a fixed pointer is reached. Following is the recursive formula: - -![](images/dataflow_equations.png) - -### Memory optimization transpiler - -At last, we take basic strategy and liveness analysis techniques learning from compilers to implement our memory optimization transpiler. - -#### add in-place attribute - -In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator. - - -#### contruct control flow graph - -Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example. - -- Block0: - -``` -lookup_table -mul -... -while(sub-block idx 1) -... -array_to_lod_tensor -cross_entropy -... -while_grad(sub-block idx 2) -read_from_array -array_to_lod_tensor -... -``` - -- Block1 - -``` -read_from_array -read_from_array -... -write_to_array -increment -write_to_array -less_than -``` - -- Block2 - -``` -read_from_array -increment -... -write_to_array -write_to_array -``` - -We can transfer all the operators and variables in ProgramDesc to build a control flow graph. - -```python -class ControlFlowGraph(object): - def __init__(self, Program): - self._sucessors = defaultdict(set) - self._presucessors = defaultdict(set) - self._uses = defaultdict(set) - self._defs = defaultdict(set) - self._live_in = defaultdict(set) - self._live_out = defaultdict(set) - self._program = Program - - def build(self): - pass - - def dataflow_analysis(self): - pass - - def memory_optimization(self): - pass - - def get_program(self): - return self._program -``` - -#### Make dataflow analysis - -We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. - -For example: - -``` -a = op1(b, c); -d = op2(a) -e = op3(d, f) -``` - -The dataflow analysis result is: - -``` -live_in(op1) = {b, c, f} -live_out(op1) = {a, f} - -live_in(op2) = {a, f} -live_out(op2) = {d, f} - -live_in(op3) = {d, f} -live_out(op3) = {} -``` - -After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f. - -#### memory sharing policy - -A memory pool will be mantained in the stage of memory optimization. Each operator node will be scanned to determine memory optimization is done or not. If an operator satifies the requirement, following policy will be taken to handle input/output variables. - -``` -if op.support_inplace(): - i --> pool - pool --> o -else: - pool --> o - i --> pool -``` - - - -## Reference - -- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5) -- Modern compiler implementation in ML, by Andrew W. Appel -- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html) diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md deleted file mode 100644 index 20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/backward.md +++ /dev/null @@ -1,158 +0,0 @@ -# Backward Building - -## Motivation - -In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part. - -When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculate gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then string them together in reverse order of forwarding part. In this way, gradients spread from the end to the beginning of the model, in another word, from the loss to parameters. - -## Challenges - -The motivation of backward building is apparent. However, implementation it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all be gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into the right place. - -## Usage - -Although the whole algorithm is comprised of many functions, only one is exposed as API: - -```python -def append_backward(loss, parameter_list=None, no_grad_set=None): - """ - Append backward part to main_program - - Args: - loss(Variable): The variable generated by the cost function. - parameter_list(list): Parameters that need to be updated by optimizers. - If None, it means all parameters need to be updated. - - no_grad_set(set): Variables that have no gradients in Block 0. - If None, the set will be generated inside the function and - contains all variables with `step_gradient=True` from all blocks. - - Return: - (list[Variable]): list of (parameters, gradients) pair. - """ -``` - -By invoking this API, the framework appends backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run. - -This API will be invoked automatically before optimizer building. -As a result, in most cases, users do not need to invoke the API by themselves to append backward part. - -## Implementation - -The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. - -### Creating `grad_op`s - -The creating of `grad_op`s is implemented by: - -```python -def _append_backward_ops_(target, - block, - target_block, - no_grad_dict, - grad_to_var): - """ - Create all grad ops, and insert them into given block - - Args: - target(Variable): the target variable of forward pass - block(Block): the block where forward ops are - target_block(Block): the block which is going to hold new generated grad ops - no_grad_dict(dict): - key(int) block index - val(set) a set of varibale names. These varibales have no gradient - grad_to_var(dict)(output argument): - key(str): grad variable name - val(str): corresponding forward variable name - """ -``` - -Given a `block`, the function will traverses all `op`s in this block in reverse order, gets corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then append it to `target_block`. - -However, some specific `op`(e.g. `while_op`, `if_else_op`) can hold its own sub-block. For these sub-blocks contains `op`s as well, the `grad_op` creating should be recursive. - -During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process: - -``` -******* pseudo-code ******** -for op in reversed(block.ops): - if op has an attribute named 'sub_block': - Get the sub-block(`s_block`) from op's attribute. - Create a new block(`grad_s_block`), whose father is `s_block`. - Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block` - - Invoke `core.get_grad_op_desc()` to get op's grad_op. - Insert name correspondings between variables and their gradients of the grad_op to grad_to_var - Assign grad_s_block to grad_op as it's 'sub_block' attribute. - Append grad_op to current target_block. -``` - -The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0). - -### Corner Cases of `grad_op` Creating - -In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and appending handling is required. These additional processes run after the algorithm mentioned above and do some special adjusts on its output `grad_op`s. - -#### Shared Variables - -If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op`s in the next backward pass. To make the gradient result being the sum of all `grad_op`s' outputs instead of the last running one, we assign each output with a temporary variable and then add a `sum_op` to add them up. - -For the debug convenience, if the final gradient name is `w@GRAD`, it's corresponding temporary variables will be named as `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`... - -See function `_addup_repetitive_outputs_` in `backward.py` for implementation details. - -#### No Gradient Variables - -In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. - -Another situation is all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. For `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped. - -It should be noted that all these zero gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. - -This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes who can be skipped and inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). - -### Creating Backward Variables - -Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by: - -```python -def _append_backward_vars_(block, - start_op_idx, - grad_to_var, - grad_info_map): - """ - Create new variables required by backward pass. - - Args: - block(Block): the block where new variables will be created - start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created - grad_to_var(dict): - key(str): grad variable name - val(str): corresponding forward variable name - In most cases, this dict is generated by _append_backward_ops_() - grad_info_map(dict)(output argument): - key(str): forward variable name - val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index - """ -``` - -Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process: - -``` -for op in block.ops[start_op_idx : ]: - - if op has an attribute named 'sub_block': - Get the sub-block(`s_block`) from op's attribute. - Invoke _append_backward_vars_(), with `block=s_block` - - for var_name in op.all_output_names(): - if block.has_var_recursive(var_name) or var_name is the name of empty variable: - continue - create a new variable named 'var_name' in block - if grad_to_var.has_key(var_name): - set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name. block) - - do op's var type inference - do op's shape inference -``` diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md deleted file mode 100644 index e451ffcc73b5de2b911e1c6de54b42a5d1d54c37..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/batch_norm_op.md +++ /dev/null @@ -1,134 +0,0 @@ -# Batch Normalization - -## What is batch normalization - -Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training. - -The principle of batch normalization can be summarized into a simple function: - -``` -y = (x - E[x]) / STD[x]) * scale + bias -``` - -`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` is the mean and standard deviation of `x`, respectively。 `scale` and `bias` are two trainable parameters. The training of batch normalization layer equals to the learning of best values of `scale` and `bias`. - -In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python. - -## Differences with normal operators - -`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design. - -1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. These require our framework to be able to inform operators current running type (training/inferencing), then operators can switch their behaviors. - -2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. In each mini-batch, `estimated_mean` is iterated by the following equations: - -``` -if batch_id == 0 - estimated_mean = E[x] -else - estimated_mean = estimated_mean * momentum + (1.0 - momentum_) * E[x] -``` - -The iterating of `estimated_variance` is similar. `momentum` is an attribute, which controls estimated_mean updating speed. - -## Implementation - -Batch normalization is designed as a single operator is C++, and then wrapped as a layer in Python. - -### C++ - -As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels. - -#### Inputs - -- `x`: The inputs data, which is generated by the previous layer. -- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`. -- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`. -- `scale`: trainable parameter 'scale' -- `bias`: trainable parameter 'bias' - -#### Outputs - -- `y`: The output data. -- `batch_mean`: The mean value of batch data. -- `batch_var`: The standard deviation value of batch data. -- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`. -- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`. - -#### Attributes - -- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode. -- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in trainning. -- `epsilon`: *float*. The epsilon value to avoid division by zero. -- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above. - -#### Kernels - -The following graph showes the training computational process of `batch_norm_op`: - - - -cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel. - -### Python - -`batch_norm_op` is warpped as a layer in Python: - -```python -def batch_norm_layer(net, - input, - output, - scale, - bias, - use_global_est = False, - epsilon = 1e-6, - momentum = 0.99): - mean_cache = scope.new_var(name = 'estimated_mean', trainable = False) - var_cache = scop.new_var(name = 'estimated_var', trainable = False) - batch_mean = scope.new_var(name = 'batch_mean') - batch_var = scope.new_var(name = 'batch_var') - batch_norm_op = Operator('batch_norm_op', - x = input, - estimated_mean = mean_cache, - estimated_mean = var_cache, - scale = scale, - bias = bias, - y = output, - batch_mean = batch_mean, - batch_var = batch_var, - saved_mean = mean_cache, - saved_var = var_cache, - is_infer = False, - use_global_est = use_global_est, - epsilon = epsilon, - momentum = momentum) - net.append_op(batch_norm_op) - return output -``` - -Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note: - -1. `estimated_mean` and `estimated_var` are assigned the same variables with `saved_mean` and `saved_var` respectively. So they share same the memories. The output mean and variance values(`saved_mean` and `saved_var`) of a certain batch will be the inputs(`estimated_mean` and `estimated_var`) of the next batch. - -2. `is_infer` decided whether `batch_norm_op` will run in training mode or inferencing mode. However, a network may contains both training and inferencing parts. And user may switch `batch_norm_op`'s running mode in Python `for` loop like this: - -```python -for pass_id in range(PASS_NUM): - # ... - net.train() # run training model - if pass_id % 100 == 0: - net.infer(test_image) # run inferencing model - # ... -``` - -`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`: - -
- -
- -Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate. - -When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed. - -How to set a target is related to Python API design, so I will leave it here waiting for more discussions. diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md deleted file mode 100644 index de9605b0e67a035ab1ef1e4cafbe838f83bc5807..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/evaluator.md +++ /dev/null @@ -1,58 +0,0 @@ -# Evaluator Design - -## Problem Statement - -During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. - -## Evaluator Design -Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. - -1. Initialize the metric state and add it into the block. - -2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once. - - -3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. - -## Implementation -This design is shown in the Python API. -Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. - - -```python -class Evaluator(object): - """ - Evaluator Base class. - """ - def __init__(self, name, **kwargs): - """ - Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts. - Auc need four variables, `true_positives`, - `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program - - The initialization of Evaluator should be responsible for: - create metric states and append to the main_program - """ - pass - - def _update_ops(self, input, label, **kwargs) - """ - Add mini-batch evaluator caculate operators to the main_program. - Add increment operator to accumulate the metric states. - """ - - - def reset(self, executor, reset_program=None): - """ - Reset metric states at the begin of each pass/user specified batch number. - Execute the reset_program to reset the states. - """ - - - def eval(self, executor, eval_program=None): - """ - Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. - Execute the eval_program and return the result. - """ - return eval_result -``` diff --git a/doc/fluid/design/modules/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot deleted file mode 100644 index 4bc47713cba2cb23f1b34fffe6426ef10ac3a9df..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/images/batch_norm_fork.dot +++ /dev/null @@ -1,25 +0,0 @@ -digraph ImageBatchNormForkGragh { - subgraph cluster_before { - Prev [label="...", shape=plaintext]; - Rnn [label="rnn_op", shape=box]; - BatchNorm [label="batch_norm_op", shape=box]; - Fc [label="fc_op", shape=box]; - After [label="...", shape=plaintext]; - Prev -> Rnn -> BatchNorm -> Fc -> After; - label="original"; - } - - subgraph cluster_after { - Prev2 [label="...", shape=plaintext]; - Rnn2 [label="rnn_op", shape=box]; - BatchNorm2_1 [label="train_batch_norm_op", shape=box]; - BatchNorm2_2 [label="infer_batch_norm_op", shape=box]; - Fc2_1 [label="fc_op", shape=box]; - Fc2_2 [label="fc_op", shape=box]; - After2_1 [label="...", shape=plaintext]; - After2_2 [label="...", shape=plaintext]; - Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1; - Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2 - label="forked"; - } -} diff --git a/doc/fluid/design/modules/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png deleted file mode 100644 index aded62bce5bc268b7a3ef4dc96c89fe21d6ea955..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/batch_norm_fork.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png deleted file mode 100644 index a99ce81ff3bf42880ebbd6a1297de3bf038e09b2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/batch_norm_op_kernel.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png deleted file mode 100644 index d312371a04c26aa6cd196e0bd1f51becb425180b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/feed_forward.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png deleted file mode 100644 index 677e99bfd9f8e72ed9fe4b27127af2ced202f447..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/feed_forward_regularized.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png deleted file mode 100644 index e1b9c7a44f94dc027598a98da93ddb8133190972..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/l1_regularization.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png deleted file mode 100644 index d5c2fcbc2ccae75ad083162e5a2dceb0210be298..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/l2_regularization.png and /dev/null differ diff --git a/doc/fluid/design/modules/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png deleted file mode 100644 index 14212ec8d36c803de96bde8a9a4b5591bd20434e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/modules/images/loss_equation.png and /dev/null differ diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst deleted file mode 100644 index b25783f0f5120991c29ba31b7b512bd4c183eecf..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/index_cn.rst +++ /dev/null @@ -1,14 +0,0 @@ -代码结构和重要模块 ------------------ - -.. toctree:: - :maxdepth: 1 - - backward.md - python_api.md - regularization.md - infer_var_type.md - optimizer.md - prune.md - register_grad_op.md - net_op_design.md diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst deleted file mode 100644 index 2108156e080996916f2650448f0a56f998757204..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/index_en.rst +++ /dev/null @@ -1,14 +0,0 @@ -Code Structure and Important Modules -------------------------------------- - -.. toctree:: - :maxdepth: 1 - - backward.md - python_api.md - regularization.md - infer_var_type.md - optimizer.md - prune.md - register_grad_op.md - net_op_design.md diff --git a/doc/fluid/design/modules/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md deleted file mode 100644 index d9d5397becba2ef1806d9341cd49cd9aabbf4a6a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/infer_var_type.md +++ /dev/null @@ -1,78 +0,0 @@ -# Design Doc: InferVarType - -## The Problem Posed - -The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. - -For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. - -The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. - -## Proposed Solution - -The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: - - -```c++ -using InferVarTypeFN = std::function< - void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; -``` - -It takes an operator description as its input and will write the output variable type and store them in block description. - -The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be - -```cpp -struct OpInfo { - InferVarTypeFN infer_var_type_; - ... -}; -``` - -The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. - -```cpp -void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { - // set the output type of variable as `LoDTensor`. - // ... -} - -struct OpInfo { - InferVarTypeFN infer_var_type_; - InferVarTypeFN GetInferVarType() const { - if (infer_var_type_) { - return infer_var_type_; - } else { - return DefaultInferVarType; - } - } -}; -``` - -## Register InferVarType - -We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. - -```cpp -class VarTypeInferer { -public: - virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; -} -``` - -Operator developers can write the specialize `VarTypeInferer` as follow. - -```cpp -class SpecialVarTypeInferer : public VarTypeInferer { -public: - virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { - // .. own logic - } -} -``` - -Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. - -``` -REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); -``` diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md deleted file mode 100644 index e64ac2fb1c6898bfeb883250347da3d9a4757b97..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/net_op_design.md +++ /dev/null @@ -1,250 +0,0 @@ -# Network Design - -`Network` is the container and controller of a set of operators, -user can build a real network from a `NetDesc` which is a protobuf message -and use `Network.Run()` to run all the operators in the network. - -A network object knows all Operators belonging to this network. Variables, -which are inputs and outputs of these operators, -are created and managed by a hierarchy of Scope objects. - -## API - -### Net -To make the `Network` extendable, a base class is defined like this - -```c++ -// operator's index stored in a network. -typedef int OpIndex; - -// The minimum a network should be implemented. -class Net { - public: - // run all the operators and return success(true) or not, with all the - // variables are located in `scope`. `context` describes the detail execution - // environment for ops. `begin` and `end` specify the scope of `ops_` to run, - // If no positive indexes are provided, all operators in `ops_` will run. - virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, - OpIndex end = -1) const = 0; - - // Add an Operator according to `def`. - virtual OpIndex AddOp(const proto::OpDef &def) = 0; - - // Add optimizer operators acctording to `attrs`. - virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0; - - // Add backward operators. - virtual Error AddBackwardOps() = 0; - - // Infer the shapes of variables required by operators in the network. The - // `scope` will be mutated according to the inferred shapes. - - static std::unique_ptr Create(const NetDesc &def = NetDesc()); -}; -``` - -All network implementations should build networks from a protobuf message which -describes the structure of a real network; `Run` method should be implemented by -all implementations to offer a universal method to forward or backward compute a network. - -`Net::Create` is a method of factory pattern and can be implemented like - -```c++ -std::unique Net::Create(const NetDesc& def) { - switch (def.model_type()) { - case NN: - return new Network(def); - case Recursive: - return new RecursiveNet(def); - case Recurrent: - return new RecurrentNet(def); - } - return nullptr; -} -``` - -Network is designed as the container of operators. to make it more extendable, -we decouple it from the related variable resources. - -`Run(Scope* scope)` takes the scope as a argument so that it can run in different scopes. - -Finally, `Net` can be used as followed - -```c++ -Scope default_scope; -OpContext default_context; -auto net = Net::CreateNet(def); - -if (net) { - net.Run(&default_scope, &default_context); -} -``` - -### `PlainNet` as a simple implementation of `BaseNet` - -A very basic implementation is as follows. All it does is simply to run every operators in sequence. - -```c++ -class PlainNet : public Net { - public: - // Create a network describe by `def`. NetDesc is the definition of a network. - PlainNet(const NetDesc &def); - - // Infer all the operators' input and output varialbes' shapes, will be called before every mini-batch - training. - virtual Error InferShape(Scope *scope) override; - - // Run all the operators with the `scope`, if no scope is provided, default - // scope will be used instead. If no OpContext is provicded, default context will be used. - virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1, - OpIndex end = -1) const override; - - virtual OpIndex AddOp(const proto::OpDef &def) override; - - virtual Error AddOptimizerOps(const OptAttrs &attrs) override; - - virtual Error AddBackwardOps() override; - - protected: - // Create operators accordding to `def`, will be called by the constructor. - Error BuildNet(const NetDesc &def); - - // Add a operator which is identified as `type` and has attributes described - // in `attrs`, the `inputs` are the keys of readonly input variables, - // `outputs` are keys of mutable output variables. An `OpIndex` will be - // returned to indicate the offset of the new operator in `ops_`. - OpIndex AddOp(const std::string &type, const std::vector &inputs, - const std::vector &outputs, - const OprAttr &attrs = OprAttr()); - - private: - // the operators owned by `Network`. - std::vector ops_; -}; -``` - -`PlainNet` will create operators so that a private member `ops_` is defined, -the operators are created by `CreateNet`, and each operator is created by `AddOp`. - - -## PlainNet Usage -`PlainNet` can be used to define and run a network as follows - -```c++ -// create an empty scope located on CPU device. -Scope scope(CPUPlace()); - -// create and init variables described in `net_desc`. -scope.CreateVariables(net_desc); -scope.InitVariables(net_desc); - -// create a network according to `net_desc` -auto net = Net::CreateNet(net_desc); -// Add more operators if needed. -net->AddOp(add...); -net->AddOp(fc...); - -net->AddBackwardOps(); -net->AddOptimizerOps(); - -// run the network providing the `scope`. -net.Run(&scope); -``` - -## `NetBuilder` as a C++ syntax wrapper -This is a detailed description of the user-related C++ network API, and may not needed in the prototype development stage. - -The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces. - -```c++ -Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid"); -Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid"); -Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label); -Variable* avg_loss = builder.AddOp("mean", loss); - -builder.BackwardFrom(avg_loss) -builder.AddOptimization(1e-4, "adam"); -builder.Run(); -``` - -`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition - -```c++ -class NetBuilder final { - public: - NetBuilder(Net* net) : net_(net) {} - - Variable* AddOp(const string& type, const vector& inputs, - size_t size, Activation act) { - // much code here. - // ... - net_->AddOp(def); - need_rebuild_net_ = true; - net_->InferShape(); - // ... - } - - Error BackwardFrom(const Variable& cost); - - Error Run(Scope* scope, OpContext* context, bool need_backward = true) { - // backward. - if (need_backward) { - if (need_rebuild_net_) { - AddBackwardOps(); - AddOptimizerOps(); - } - net_->Run(scope, context); - return; - } - // just forward. - net_->Run(scope, context, 0, last_forward_op_); - } - - protected: - Error AddBackwardOps(); - Error AddOptimizerOps(); - - private: - Net* net_; - OpIndex last_forward_op_{-1}; - bool need_rebuild_net_{true}; -} -``` - -### Compatibility with RNN - -Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, -for example we can implement a simple recurrent neural network as follows - -```c++ -// copy some `vars` form `source` to `target` -void Copy(const Scope &source, Scope &target, - const std::vector &vars); - -Scope default_scope; -// some initial mutations on `default_scope` here. - -auto rnn_step_net = PlainNet(rnn_step_net_def); - -// Create rnn's states, the last scope is used to store rnn outputs. -Scope *rnn_states = new Scope[num_states + 1]; - -for (int i = 0; i < num_states + 1; i++) { - // Initialize all rnn state scopes, copy parameters and so on. - rnn_states[i].CreateVars(rnn_step_net_def); - Copy(default_scope, rnn_states[i], rnn_related_vars); - // Prepare rnn's inlinks, just copy inlink variables to each state. - Copy(default_scope, rnn_states[i], inlink_vars); -} - -// Run the rnn. -for (int i = 0; i < num_states; i++) { - rnn_step_net.Run(rnn_states[i]); - // Copy current state's state variables to next state, the related variables - // are named like "previous_state_xxx". - Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars) -} - -// Copy rnn's final outputs to `default_scope`. -Copy(rnn_states[num_states], default_scope, outlink_vars); -``` diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md deleted file mode 100644 index 1c25fde9cafb322f789662077d3fc6cc1d64ce38..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/optimizer.md +++ /dev/null @@ -1,91 +0,0 @@ -# Optimizer Design - -## The Problem - -A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: - -1. the forward pass, which computes intermediate results and the cost(s), -1. the backward pass, which derives gradients from intermediate results and costs, and -1. the optimization pass, which update model parameters to optimize the cost(s). - -These works rely on three kinds of operators: - -1. forward operators, -1. gradient operators, and -1. optimization operators. - -It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. - -In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. - - -## High-level Python API to describe the training process - -1. User write code to describe the network: - - ```python - images = layer.data("images") - labels = layer.data("labels") - w1 = pd.var("w1") - b1 = pd.var("b1") - hidden = layer.fc(images, w=w1, b=b1) - cost = layer.mse(hidden, labels) - ``` - - The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). - - -2. Users create a certain kind of Optimizer with some argument. - - ```python - optimizer = AdagradOptimizer(learing_rate=0.001) - ``` - -3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. - - ```python - opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) - ``` - The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. - -4. Users use Session/Executor to run this opt_op_list as target to do training. - - ```python - sess.run(target= opt_op_list, ...) - ``` - -### Optimizer Python interface: - -```python -class Optimizer(object): - """Optimizer Base class. - - """ - - def __init__(self): - pass - - def create_optimization_pass(self, parameters_and_grads): - """Add optimization operators to update gradients to variables. - - Args: - parameters_and_grads: a list of (variable, gradient) pair to update. - - Returns: - optmization_op_list: a list of optimization operator that will update parameter using gradient. - """ - return None - - def minimize(self, loss, parameter_list): - """Add operations to minimize `loss` by updating `parameter_list`. - - This method combines interface `append_backward()` and - `create_optimization_pass()` into one. - """ - params_grads = self.create_backward_pass(loss, parameter_list) - update_ops = self.create_optimization_pass(params_grads) - return update_ops - -``` - -Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/fluid/design/modules/prune.md b/doc/fluid/design/modules/prune.md deleted file mode 100644 index 4a5cf10c79a554779137f0cce5494fdd96ef6b7a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/prune.md +++ /dev/null @@ -1,63 +0,0 @@ -# Prune - -## Motivation - -We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement -`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` -and generate a pruned `ProgramDesc`. - -## Challenge - -Pruning need to support both variables and operators being evaluation targets. Consider the following -different situations. - -```python -# Case 1: run foward pass. -cost_np = session.run(target=cost) -# Case 2: run backward passing. -opts_np, _ = session.run(target=[cost, opt]) -# Case 3: run checkpointing -_ = session.run(target=checkpoint) -``` - -## Solution - -To support evaluation of operators, we add `is_target` field in the `OpDesc`. - -```c++ -message OpDesc { - required string type = 3; - repeated Var inputs = 1; - repeated Var outputs = 2; - repeated Attr attrs = 4; - optional bool is_target = 5 [ default = false ]; -}; -``` - -To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). -For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being -`fetch_op`'s input. Then we also set `fetch_op` is a target. - -### Algorithm - -If an operator needs to be run, it must fall into one of the following cases: - -1. It is the target. -2. It is depended by some other ops, meaning its output is some other op's input. - -The first case can be checked by `op_desc.is_traget()` . The second case can be implement as - -```c++ -bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { - for (auto& var : op_desc.outputs()) { - for (auto& argu : var.arguments()) { - if (dependent_vars.count(argu) != 0) { - return true; - } - } - } - return false; -} -``` - -Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md deleted file mode 100644 index 83af4e55485c079265d3f2b1e15070825b532c02..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/python_api.md +++ /dev/null @@ -1,325 +0,0 @@ -# Design Doc: Python API - -Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. - - - - - - - - - - - - - - - - - - - - - - - - - - -
Python classesProtobuf messages
Program ProgramDesc
Block BlockDesc
Operator OpDesc
Variable VarDesc
- - -Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. - -## Core Concepts - -### Program - -A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. - -Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. - -```python -class Program(objects): - def __init__(self): - self.desc = core.NewProgram() # a C++ ProgramDesc pointer. - self.blocks = vector() - self.blocks.append(Block(self, -1)) # the global block - self.current_block = 0 # initialized to the global block - - def global_block(): - return self.blocks[0] - - def current_block(): - return self.get_block(self.current_block) - - def rollback(): - self.current_block = self.current_block().parent_idx - - def create_block(): - new_block_idx = len(self.block) - self.blocks.append(Block(self, self.current_block)) - self.current_block = new_block_idx - return current_block() -``` - -`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. - -`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. - -### Block - -A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes - -1. a map from variable names to an instance of the Python `Variable` class, and -1. a list of `Operator` instances. - -```python -class Block(objects): - def __init__(self, program, parent_idx): - self.desc = core.NewBlock(program.desc) - self.program = program - self.vars = map() - self.ops = vector() - self.parent_idx = parent_idx - - def create_var(self, ...): - return Variable(self, ...) - - def _create_global_var(self, ...): - program.global_block().create_var(...) - - def create_parameter(self, name, ...): - # Parameter is a subclass of variable. See Parameter section for details. - self.vars[name] = Parameter(self._create_global_var(...), ...) - return self.vars[name] - - def append_operator(self, ...): - self.ops.append(Operator(self, ...)) - - def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. - self.ops.prepend(Operator(self, ...)) -``` - -`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. - -`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. - -### Operator - -The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. - -```python -class Operator(object): - def __init__(self, - block, # Block - type, # string - inputs, # dict - outputs,# dict - attrs # dict - ): - self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) - core.infer_shape(self.desc, inputs, outputs) - - def type(self): - return self.desc.type() -``` - -`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. - -### Variable - -Operators take Variables as its inputs and outputs. - -```python -class Variable(object): - def __init__(self, - block=None, # Block - name=None, # string - shape, # tuple - dtype="float32", # string - lod_level=None # int - ): - if name is None: - name = unique_name_generator() - self.name = name - self.block = block - self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) - self.writer = None -``` - -Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. - -### Parameter - -A parameter is a global variable with an initializer (or load) operator. - -```python -class Parameter(Variable): - def __init__(self, - block=None, # Block - name=None, # string - shape, # tuple - dtype="float32", # string - lod_level=None # int - trainable, # bool - initialize_op_attrs, - optimize_op_attrs): - super(Parameter, self).__init__(block, name, shape, dtype, lod_level) - self.trainable = trainable - self.optimize_op_attrs = optimize_op_attrs - block.prepend(Operator(block, # Block - initialize_op_attrs['type'], # string - None, # no inputs - self, # output is the parameter - initialize_op_attrs) -``` - -When users create a parameter, they can call - -```python -program.create_parameter( - ..., - init_attr={ - type: "uniform_random", - min: -1.0, - max: 1.0, - }) -) -``` - -In above example, `init_attr.type` names an initialize operator. It can also name the load operator - -```python -init_attr={ - type: "load", - filename: "something.numpy", -} -``` - -`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. - -## Layer Function - -A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. - -Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). - -For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. - - -### Necessity for reusing code between layer functions - -There are a lot of code that can be reused. Such as - -* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. -* Append the activation operator. -* Create a temporary variable. -* Create parameter. -* Generate a unique name. -* Add a bias. -* ... - -A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. - - - -### Comparision between global functions and helper class - -The `FullyConnected` layer will be as follow when we provide global functions: - -```python -def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): - if name is None: - name = unique_name("fc") - input = multiple_input(input) - param_attr = default_param_attr(param_attr) - param_attr = multiple_param_attr(param_attr, len(input)) - - # mul - mul_results = [] - for ipt, attr in zip(input, param_attr): - shape = ipt.shape[1:] + [size] - w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) - tmp = create_tmp_var(name) - g_program.current_block().append_op("mul", {ipt, w}, {tmp}) - mul_results.append(tmp) - - # add sum - ... - # add bias - ... - # add activation - ... - return out -``` - -We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: - -1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. -2. Global functions will force layer developers to pass its parameter time by time. - -So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. - -```python -def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): - helper = LayerHelper(locals()) # pass all parameter to LayerHelper - - mul_results = [] - for ipt, param in helper.iter_multiple_input_and_param(): - w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) - tmp = helper.create_tmp_variable() - helper.append_op('mul', {ipt, w}, {tmp}) - mul_results.append(tmp) - - pre_bias = helper.add_sum(mul_results) - pre_activation = helper.add_bias(pre_bias) - return helper.add_activation(pre_activation) -``` - -We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. - - -### Implementation of layer helper - -We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: - -```python -class LayerHelper(object): - def __init__(self, **kwargs): # kwargs is short for `keyword arguments` - self.kwargs = kwargs - - def add_activation(self, input_var): - act = self.kwargs.get("act", None) # default value is None - if act is None: # do nothing if no act - return input_var - - tmp = self.create_tmp_var(self) - self.append_op(type=act, input=input_var, output=tmp) - return tmp -``` - -### Return value of layer functions - -The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example, - -1. Users can debug the network by printing parameter gradients. -2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute. - -However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing. - -The sample usage is - -```python -data = fluid.layers.data(...) -hidden = fluid.layers.fc(data, ...) -... - -executor.run(fetch_list=[hidden.param, hidden.param.grad], ...) -``` - - -## Optimizer - -[Optimizer Design Doc](./optimizer.md) diff --git a/doc/fluid/design/modules/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md deleted file mode 100644 index 8d973eb53178c3e889c845144553a453e11f067c..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/register_grad_op.md +++ /dev/null @@ -1,92 +0,0 @@ -# Design Doc: Gradient Operators Registration - - -## The Problem Posed - -Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance. - -However, we noticed two problems with the current design: - -1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message. - -1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation. - -## The Current Implementation - -Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows - -```cpp -struct OpInfo { - std::function creator_; - std::string grad_op_type_; - ... -}; - -map OpInfoMap; - -OperatorBase* CreateGradientOperator(const OperatorBase& op) { - return OpInfoMap.at(op.Type()).creator_(...); -} -``` - -## Proposed Solution - -The mapping relationship between an operator and its gradient operators is a function. The interface of this function is: - -```cpp -// (OpDesc) --> vector -std::function(const OpDescBind&)>; -``` - -The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`. - -The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like - -```cpp -struct OpInfo { - std::function>(const OpDescBind&)> grad_op_maker_; - ... -}; -``` - -The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators. - -We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is - -```cpp -class GradOpDescMakerBase { -public: - GradOpDescMakerBase(const OpDescBind& ); - virtual std::vector> operator()()const = 0; -}; -``` - -We can convert `GradOpDescMakerBase` to `std::function>(const OpDescBind&)>` by - -```cpp -using GradOpMaker = ...; -std::function(const OpDescBind&)> func; -func = [] (const OpDescBind& fwd_op) { - GradOpMaker maker(fwd_op); - return maker(); -}; -``` - -We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator. - -We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. - -The user interface should be - -```cpp -vector MinusOpGradMaker(OpDesc) {...} -REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker); -// Developers can still manually implement gradient operator. -REGISTER_OPERATOR(minus_grad, MinusGradOp); -``` - -The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside. - -```cpp -REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp); -``` diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md deleted file mode 100644 index 519a9143033386678351ff78a465e5ba6e220c52..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/regularization.md +++ /dev/null @@ -1,66 +0,0 @@ -# Regularization in PaddlePaddle - -## Introduction to Regularization -A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore. - -### Parameter Norm Penalties -Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows: - -
- -The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`. - -The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows: - -##### L2 Regularization: -
- -##### L1 Regularization -
- -A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html). - -## Regularization Survey - -A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). - -## Proposal for Regularization in PaddlePaddle - -### Low-Level implementation - -In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations: -- L2_regularization_op -- L1_regularization_op - -These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. - -The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. - -### Computation Graph - -Below is an example of a really simple feed forward neural network. - -
- -The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows: - -
-    -### Python API implementation for Regularization - -Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. - -#### Creation of Regularization ops -There are two possibilities for creating the regularization ops: -1. We create these ops immediately while building the computation graph. -2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. - -The proposal is to add these ops in a lazy manner just before the backward pass. - -#### Storage of Regularization attributes - -Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. - -#### High-level API - -In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). diff --git a/doc/fluid/design/modules/selected_rows.md b/doc/fluid/design/modules/selected_rows.md deleted file mode 100644 index 1a98839a957612b91b2276b58818623ecc62d1d5..0000000000000000000000000000000000000000 --- a/doc/fluid/design/modules/selected_rows.md +++ /dev/null @@ -1,74 +0,0 @@ -# Design Doc: Selected Rows - -`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure: - -```cpp -class SelectedRows { - private: - vector rows_; - Tensor value_; - int height_; -}; -``` - -The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. - -Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: - -``` -x = SelectedRow { - rows = [73, 84], - value = [[1, 2], [3,4]] -} -``` - - -## SelectedRows in Protobuf - -`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. -So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. - -```proto -message TensorDesc { - required DataType data_type = 1; - repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] -} - -message LodTensorDesc { - required TensorDesc tensor = 1; - optional int lod_level = 2; -} - -message VarDesc { - required string name = 1; - enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - } - required VarType type = 2; - optional LodTensorDesc lod_desc = 3; - optional TensorDesc selected_rows_desc = 4; - optional bool persistable = 5 [ default = false ]; -} -``` - -## InferShape for Selected Rows - -Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. - -For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following - -```cpp -void TableLookupGrad::InferShape(context) { - ... - context.SetDataType("Embedding.Grad", kSelectedRows); -} -``` - - -## Sparse Operators - -There are several operators that need to be written to support `SelectedRows`. These are: - -1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. -2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md deleted file mode 100644 index bc222564e3ec28e306ca0572b6a23104f6e9cbc5..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/api.md +++ /dev/null @@ -1,261 +0,0 @@ -# PaddlePaddle Design Doc - -## Ingredients - -As our design principle is starting from the essence: how could we -allow users to express and solve their problems as neural networks. -Some essential concepts that our API have to provide include: - -1. A *topology* is an expression of *layers*. - -1. A layer could be any kind of computation, including *cost*. - -1. Some layers have parameters, some don't. Most costs don't have - parameters. - -1. In some topologies, layers share parameters. For - example, - [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850). - -1. At programming time, users specify topologies and possible sharing - of parameters. PaddlePaddle can figure out and create parameters - required (and possibly shared) by one or more topologies. - - -## Starting from Examples - -As a summarization -of -[our disucssion](https://github.com/PaddlePaddle/Paddle/issues/1315), -let us present two examples here: - - -### Example 1. Sharing Parameters between Layers - -We use -the -[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model -in this example. For your convenience, I copy-a-paste the model's -topology as follows: - -``` -A -> f -\ -Q -> f --> cost -B -> f -/ -``` - -The following program trains the topology including the cost, and then -use the sub-network in the trained topology in inference: - -```python -def f(in): - e = paddle.layer.embedding(in, parameter_name="embedding") - o = paddle.layer.softmax(e, parameter_name="semantic") - return o - -# Create 3 topologies (subnets), they share parameters because all -# correspoinding layers have the same parameter names. -fA = f(paddle.layer.data(input_name="A")) -fB = f(paddle.layer.data(input_name="B")) -fQ = f(paddle.layer.data(input_name="Q")) - -topology = paddle.layer.less_than( - paddle.layer.cross_entropy(fA, fQ), - paddle.layer.corss_entropy(fB, fQ)) - -# Derive parameters required in topology and create them in model. -parameters = paddle.parameters.create(topology) - -# Estimate parameters used in topology from data. -paddle.train(topology, parameters, reader=read_ranking_model_data) - -# Inference using fA (or fB or fC, as they share their parameters). -[testA, testB, testQ] = read_ranking_model_data() -print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA) -``` - - -### Example 2. Sharing Parameters between "Models" - -We use GAN in this example. In the following example program, `d0` and `d1` -correspond to the two networks in the following figure: - - - -```python -def G(in): - # over-simplified example as G has only one layers: - return paddle.layer.fc(in, parameter_name="G") - -def D(in); - # again, over-simplified: - return paddle.layer.fc(in, parameter_name="D") - -# Construct the first topology, which contains both D and G. -# By learning this topology, we update parameters of G. -d0 = paddle.layer.should_be_false(D(G(paddle.layer.data()))) - -# Construct a second topology d1, which contains only D. By -# training this topology, we update parameters of D. Note -# that d1 share parameters with d0. -d1 = paddle.layer.should_be_true(D(paddle.layer.data())) - -# Create parameters from a list of multiple topologies (models) for -# the chance to share parameters between these topologies. -parameters = paddle.parameters.create([d0, d1]) - -# Iterative training of GAN. -for ...: - train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"}) - train(d1, parameters, reader=read_from_realistic_images) - -# Use d1 for inference: -print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images) -``` - - -### Summarization - - -Above two programs reveal some important design concerns: - -1. Users describe a topology as an expression of layers. Every layer - has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By - specifying the parameter name, users can specify the sharing of - parameters between layers and even between topologies. - -1. `paddle.parameters.create` figures out parameters required by one - or more topologies from parameter names of layers. It creates these - parameters and returns a `ParameterSet` object, which is in essence - a map from *parameter names* to *parameters*. - -1. At training and inference time, `paddle.train` and `paddle.infer` - requires both a topology and the parameter set that holds the parameters of that topology. There are some reasons: - - 1. This prevents users from forgetting to call - `paddle.parameters.create`. - 1. `paddle.train` needs to know which parameter set to update. - 1. Users could load another (pre-trained) parameter set and use it - with a topology in `train.infer`. - -1. By specifying the `immutable_parameters` parameter of - `paddle.train`, we can forbid the update of these parameters. - - -## Reader - -Not all programming frameworks allow users to define I/O functions. -An example is Google MapReduce, which can only read from text, -SSTable, and RecordIO files. Hadoop MapReduce allows users to define -readers and writers by deriving from base classes `Reader` and -`Writer`. The former is less flexible but also less error-prone. We -decide to provide the flexibility to users to define their readers. - - -There are some open questions here: - -1. **Should a reader return a Python dictionary?** - -1. **How to map multiple outputs from a reader to multiple data layers?** - -1. **How to easily compose some existing readers to read more data and - feed a topology with more data layers?** - - -## Training - -The recommended way to training a model is to call `paddle.train`, -which simply calls `paddle.trainer.Default`, a global variable of -type `paddle.trainer.SGD`. Equivalently, we can do - -```python -opt = paddle.trainer.SGD(..., paddle.updater.Adam(...)) -opt.train(topology, parameters, reader=read, ...) -``` - -### Updater - -Please be aware that a trainer can accept an updater as its data -member, where an updater is a class derived from -`paddle.trainer.Updater`. This is to make it easier to customize -trainers, as discussed -[here](https://github.com/PaddlePaddle/Paddle/issues/1319). - -### Event Handler - -`paddle.train` and `paddle.trainer.XXX.train` take an optional -parameter `event_handler`, which should be either `None` or a function -that handle some events: - -1. BeginTraining -1. EndTraining -1. BeginIteration -1. EndIteration -1. BeginPass -1. EndPass - -where EndPass is sent if and only if the reader yields -`end_pass=True`. - -An example as follows: - -```python -def event_handler(event): - if ininstance(event, paddle.event.EndIteration): - print paddle.test(...) - -paddle.train(topology, parameters, reader, event_handler) -``` - -If we are writing a PaddlePaddle program in and for iPython/Jypyter, -we can use metaplotlib in the event handler to plot a curve of -cost/error versus iterations, as shown -[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/). - -### Distributed Training - -If users want to do distributed training on a cluster, s/he should -call `paddle.dist_train` and provides access tokens to the cluster as -a parameter. - -For example, if the user has a TLS certificate that allows him to -access a Kubernetes cluster, s/he should be able to call - -```python -paddle.dist_train(model, - trainer=paddle.trainer.SGD(..., - paddle.updater.Adam(...)), - reader=read, - k8s_user="yi", - k8s_token="kube_cluster_tls.pem", - k8s_job="hello", - num_parameter_servers=15) -``` - -The pseudo code of `paddle.dist_train` is as follows: - -```python -def dist_train(topology, parameters, trainer, reader, ...): - if os.getenv("KUBERNETES_SERVICE_HOST") == None: - image_name = k8s_user + '/' + k8s_job - docker_build(image_name) - docker_push() - kube_ctrl_start_job(image_name, k8s_user, k8s_token) - else: - rank = kube_list_containers_in_job_and_return_current_containers_rank() - if rank == 0: - master() - elif rank < 15: - parameter_server() - else: - trainer.train(model, reader=read) -``` - -Please be aware that if a process is running on the Kubernetes -cluster, it will have some environment variables pre-defined. - -If `dist_train` doesn't see these environment variables, it knows -that it's running on users' personal computer, and it should work as a -*launcher*. Otherwise, it knows that it's running on the cluster and -need to figure out its role as either the master, or a trainer, or a -parameter server. diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle deleted file mode 100644 index c933df2cb855462c52b2d25f7f9a99b95652961d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/motivation/fluid-compiler.graffle and /dev/null differ diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png deleted file mode 100644 index 1b0ffed2039c91a3a00bbb719da08c91c3acf7bb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/motivation/fluid-compiler.png and /dev/null differ diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md deleted file mode 100644 index 4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/fluid.md +++ /dev/null @@ -1,140 +0,0 @@ -# Design Doc: PaddlePaddle Fluid - -## Why Fluid - -When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework? - -Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning. - -## The Evolution of Deep Learning Systems - -Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Existed sincemodel as sequence of layersmodel as graph of operatorsNo model
2013 Caffe, Theano, Torch, PaddlePaddle
2015 TensorFlow, MxNet, Caffe2, ONNX, n-graph
2016 PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid
- - -From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these. - -## Deep Learning Programming Paradigms - -With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following: - -```python -x = layer.data("image") -l = layer.data("label") -f = layer.fc(x, W) -s = layer.softmax(f) -c = layer.mse(l, s) - -for i in xrange(1000): # train for 1000 iterations - m = read_minibatch() - forward({input=x, data=m}, minimize=c) - backward(...) - -print W # print the trained model parameters. -``` - -The above program includes two parts: - -1. The first part describes the model, and -2. The second part describes the training process (or inference process) for the model. - -This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt. - -This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as following: - -```python -W = tensor(...) - -for i in xrange(1000): # train for 1000 iterations - m = read_minibatch() - x = m["image"] - l = m["label"] - f = layer.fc(x, W) - s = layer.softmax(f) - c = layer.mse(l, s) - backward() - -print W # print the trained model parameters. -``` - -We can see that the main difference is the moving the model configuration part (the first step) into the training loop. This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block. This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop. - -## Describe Arbitrary Models for the Future - -Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet. - -As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following: - -```python -for i in xrange(1000): - m = read_minibatch() - x = m["sentence"] - for t in xrange x.len(): - h[t] = the_step(x[t]) -``` - -With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following: - -```python -train_loop = layers.While(cond) -with train_loop.block(): - m = read_minibatch() - x = m["sentence"] - rnn = layers.While(...) - with rnn.block(): - h[t] = the_step(input[t]) -``` - -An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58). - -From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. - -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. - -## Turing Completeness - -In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth. - -## The Execution of a Fluid Program - -There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). - -There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program. - -Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md). - -## Backward Compatibility of Fluid - -Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators. - -For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format. diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md deleted file mode 100644 index 6dd3840a0734e8593890dcf8044746197350c6f5..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/fluid_compiler.md +++ /dev/null @@ -1,110 +0,0 @@ -# PaddlePaddle Fluid: Towards a Compiled Programming Language - -As described in [fluid.md](fluid.md), when a Fluid application program -runs, it generates a `ProgramDesc` protobuf message as an intermediate -representation of itself. The C++ class `Executor` can run this -protobuf message as an interpreter. This article describes the Fluid -compiler. - -![](fluid-compiler.png) - -## ProgramDesc - -Before we go deeper into the idea of compiled language, let us take a -look at a simple example Fluid application. - -```python -import "fluid" - -func paddlepaddle() { - X = fluid.read(...) - W = fluid.Tensor(...) - Y = fluid.mult(X, W) -} -``` - -This program consists of a [block](../concepts/block.md) of three operators -- -`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like -the following - -```protobuf -message ProgramDesc { - block[0] = Block { - vars = [X, W, Y], - ops = [ - read(output = X) - assign(input = ..., output = W) - mult(input = {X, W}, output = Y) - ], - } -} -``` - -## Transpilers - -We can write a transpiler program that takes a `ProgramDesc`, e.g., -the above one, and outputs another `ProgramDesc`. Let us take some -examples: - -1. *Memory optimization transpiler*: We can write a transpiler that - inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so - to free memory early, before the end of an iteration, so to keep a - small memory footprint. - -1. *Distributed training transpiler*: We can write a transpiler that - converts a`ProgramDesc` into its distributed version of two - `ProgramDesc`s -- one for running by the trainer processes and the - other for the parameter server. - -In the rest of this article, we talk about a special kind of -transpiler, *Native code generator*, which takes a `ProgramDesc` and -generates a `.cu` (or `.cc`) file, which could be built by C++ -compilers (gcc, nvcc, icc) into binaries. - -## Native Code Generator - -For the above example, the native code generator transpiler, say, the -CUDA code generator, should generate a `main` function: - -```c++ -void main() { - auto X = fluid_cuda_read(...); - auto W = fluid_cuda_create_tensor(...); - auto Y = fluid_cuda_mult(X, W); -} -``` - -and the definitions of functions `fluid_cuda_read`, -`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware -that each function could just define a C++ instance of an operator and -run it. For example - -```c++ -paddle::Tensor fluid_cuda_read(...) { - paddle::Tensor t; - paddle::operator::Read r(&t, ...); - r.Run(); - return t; -} -``` - -For computational operators that have multiple *kernels*, each for a -specific hardware platform, for example, the `mult` operator, the -generated code should call its CUDA kernel: - -```c++ -paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, - const paddle::Tensor& b) { - paddle::Tensor t; - paddle::operator::Mult m(a, b, ...); - Mult.Run(cuda_context); -} -``` - -where `cuda_context` could be a global variable of type -`paddle::CUDADeviceContext`. - -## Multi-Block Code Generation - -Most Fluid application programs may have more than one blocks. To -execute them, we need to trace [scopes](../concepts/scope.md). diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst deleted file mode 100644 index 7706e73eca644ed6db772fd77da947395313237f..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/index_cn.rst +++ /dev/null @@ -1,10 +0,0 @@ -设计动机和目标 -------------- - -.. toctree:: - :maxdepth: 1 - - api.md - refactorization.md - fluid.md - fluid_compiler.md diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst deleted file mode 100644 index 10b64b257c604ced6b957d6d6018e8a363f00fac..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/index_en.rst +++ /dev/null @@ -1,10 +0,0 @@ -Design Motivations and Goals --------------------------------------- - -.. toctree:: - :maxdepth: 1 - - api.md - refactorization.md - fluid.md - fluid_compiler.md diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md deleted file mode 100644 index ad9d0f6d3f3ad9884f108826e8410871fffd51bf..0000000000000000000000000000000000000000 --- a/doc/fluid/design/motivation/refactorization.md +++ /dev/null @@ -1,275 +0,0 @@ -# Design Doc: Refactorization Overview - -The goals of refactoring include: - -1. Making it easy for external contributors to write new elementary computation operations. -1. Making the codebase clean and readable. -1. Designing a new computation representation -- a computation graph of operators and variables. -1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs. - -## Computation Graphs - -1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs. - - 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example. - -1. Users write Python programs to describe the graphs and run them (locally or remotely). - -1. A graph is composed of *variables* and *operators*. - -1. The description of graphs must be serializable/deserializable, so that: - - 1. It can be sent to the cloud for distributed execution, and - 1. It can be sent to clients for mobile or enterprise deployment. - -1. The Python program does two things - - 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to - 1. the C++ library `libpaddle.so` for local execution, - 1. the master process of a distributed training job for training, or - 1. the server process of a Kubernetes serving job for distributed serving. - 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message. - -## Description and Realization of Computation Graph - -At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. - -At runtime, the C++ program realizes the graph and runs it. - - - - - - - - - - - - - - - - - - - - - - - - - - -
Representation (protobuf messages)Realization (C++ class objects)
Data -VarDesc -Variable
Operation -OpDesc -Operator
Block BlockDesc Block
- - -The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). - -## Compilation and Execution - -1. Run a Python program to describe the graph. In particular, the Python application program does the following: - - 1. Create `VarDesc` to represent local/intermediate variables, - 1. Create operators and set attributes, - 1. Validate attribute values, - 1. Infer the type and the shape of variables, - 1. Plan memory-reuse for variables, - 1. Generate the backward graph - 1. Add optimization operators to the computation graph. - 1. Optionally, split the graph for distributed training. - -1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: - - 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block, - 1. realize local variables defined in the BlockDesc message in the new scope, - 1. a scope is similar to the stack frame in programming languages, - - 1. Create an instance of class `Block`, in which, - 1. realize operators in the BlockDesc message, - - 1. Run the Block by calling - 1. `Block::Eval(vector* targets)` for forward and backward computations, or - 1. `Block::Eval(vector* targets)` for optimization. - - -## Intermediate Representation (IR) - -```text -Compile Time -> IR -> Runtime -``` - -### Benefits of IR - -- Optimization - ```text - Compile Time -> IR -> Optimized IR -> Runtime - ``` -- Automatically send partitioned IR to different nodes. - - Automatic Data Parallelism - ```text - Compile Time - |-> Single GPU IR - |-> [trainer-IR-0, trainer-IR-1, pserver-IR] - |-> Node-0 (runs trainer-IR-0) - |-> Node-1 (runs trainer-IR-1) - |-> Node-2 (runs pserver-IR) - ``` - - Automatic Model Parallelism (planned for future) - ---- - -## Operator/OpWithKernel/OpKernel - -![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot) - ---- - -## Operator -![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot) - -* `Operator` is the fundamental building block of the user interface. - * Operator stores input/output variable names and attributes. - * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. - * Use `Run` to compute the `output` variables from the `input` variables. - ---- - -## OpWithKernel/Kernel - -![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot) - -* `OpWithKernel` inherits `Operator`. -* `OpWithKernel` contains a Kernel map. - * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`. - * `OpKernelKey` is the map key. Only device place now, but may be data type later. - ---- - -## Why separate Kernel and Operator - -* Separate GPU and CPU code. - * Make Paddle capable of running without GPU. -* Make one operator (which is a user interface) and create many implementations. - * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. ---- - -## Libraries for Kernel development - -* `Eigen::Tensor` contains basic math and element-wise functions. - * Note that `Eigen::Tensor` has broadcast implementation. - * Limit the number of `tensor.device(dev) = ` in your code. -* `thrust::transform` and `std::transform`. - * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. - * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. -* Hand-writing `GPUKernel` and `CPU` code - * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) ---- -## Operator Registration - -### Why is registration necessary? -We need a method to build mappings between Op type names and Op classes. - -### How is registration implemented? -Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. - ---- -## The Registry Map - -### `OpInfoMap` - -`op_type(string)` -> `OpInfo` - -`OpInfo`: - -- **`creator`**: The Op constructor. -- **`grad_op_type`**: The type of the gradient Op. -- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes. -- **`checker`**: Used to check attributes. - ---- -## Related Concepts - -### Op_Maker -It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37)) - -### Register Macros -```cpp -REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class) -REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) -``` - ---- -## Registration Process -1. Write an Op class and its gradient Op class, if required. -2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. -3. Invoke the macro `REGISTER_OP`. This macro will - 1. Call maker class to complete `proto` and `checker` - 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` - ---- -## Backward Module (1/2) -### Create Backward Operator -- Mapping from forward Op to backward Op -![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) - ---- -## Backward Module (2/2) -### Build Backward Network -- **Input**: a graph of forward operators -- **Output**: a graph of backward operators -- **Corner cases in construction** - - Shared Variables => insert an `Add` operator to combine gradients - - No Gradient => insert a `fill_zero_grad` operator - - Recursive NetOp => call `Backward` recursively - - RNN Op => recursively call `Backward` on stepnet - - RNN Op => recursively call `Backward` on stepnet - - ---- -## Scope, Variable, Tensor - -* `Tensor` is an n-dimension array with type. - * Only dims and data pointers are stored in `Tensor`. - * All operations on `Tensor` are written in `Operator` or global functions. - * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) -* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. - * `step_scopes` in RNN is a variable and not a tensor. -* `Scope` is where variables are stored. - * map - * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. - ---- -## Block (in design) -### the difference between original RNNOp and Block -- As an operator is more intuitive than `RNNOp`, -- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, -- Fits the compile-time/ runtime separation design paradigm. - - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc` - - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`. - ---- -## Milestone -- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, -- Model migration - - Framework development gives **priority support** to model migration, for example, - - the MNIST demo needs a Python interface, - - the RNN models require the framework to support `LoDTensor`. - - Determine some timelines, - - Frequently used Ops need to be migrated first, - - Different models can be migrated in parallel. -- Improve the framework at the same time -- Accept imperfection, concentrate on solving the specific problem at the right price. - ---- -## Control the migration quality -- Compare the performance of migrated models with old ones. -- Follow the google C++ style guide. -- Build the automatic workflow of generating Python/C++ documentations. - - The documentation of layers and ops should be written inside the code. - - Take the documentation quality into account when submitting pull requests. - - Preview the documentations, read and improve them from a user's perspective. diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst deleted file mode 100644 index 1f8439e8623e1c1ae9a12c24d08079f0ec3d761f..0000000000000000000000000000000000000000 --- a/doc/fluid/design/multi_devices/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -多设备支持 ------------- - -.. toctree:: - :maxdepth: 1 - - operator_kernel_type.md - kernel_selection.md - kernel_hint_design.md diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst deleted file mode 100644 index 819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2..0000000000000000000000000000000000000000 --- a/doc/fluid/design/multi_devices/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@ -Multi-Device Support ----------------------- - -.. toctree:: - :maxdepth: 1 - - operator_kernel_type.md - kernel_selection.md - kernel_hint_design.md diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md deleted file mode 100644 index 6edc14ca73b1abf824981b59511a9aca4e0f3b47..0000000000000000000000000000000000000000 --- a/doc/fluid/design/multi_devices/kernel_hint_design.md +++ /dev/null @@ -1,59 +0,0 @@ -# Kernel Hint Design - -## Problem -In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. - -In the current design, we use KernelType to describe one kernel. - -```cpp -struct KernelType { - Place place_; - DataType data_type_; - LayoutType layout_; -}; -``` - `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it. - -The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use. - -So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel. - -The problem is, how should we define and send the information for `GetExpectedKernelType` to use? - -## Solution - -### Potential choice -1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel. - -2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint. - -### Final choice -To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose. - -In C++ - -```cpp -const std::string kForceCPU = "force_cpu"; -const std::string kUseCUDNN = "use_cudnn"; -const std::string kUseMKLDNN = "use_mkldnn"; - -KernelType GetExpectedKernelType() { - if (Attr(kForceCPU)) { - return KernelType(CPUPlace, ...) - } else { - ... - } -} -``` - -In Python code - -```python -FORCE_CPU = core.kForceCPU() - -def xx_layer(..., force_cpu=false): - layer_helper = LayerHelper(...) - layer_helper.append_op( - type="xx", - attr={FORCE_CPU: force_cpu}) -``` diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md deleted file mode 100644 index 4d2aab87b8cf30d03075e96cc4c67070efaf963a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/multi_devices/kernel_selection.md +++ /dev/null @@ -1,101 +0,0 @@ -# Kernel Selection - -## Background -Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold. - -The `OpKernelType ` is as follows: - -```cpp -struct OpKernelType { - Place place_; - DataType data_type_; - DataLayout data_layout_; - LibraryType library_type_; -}; -``` - -- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace. - -- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`. - -- The `data_layout_ ` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel. - -- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`. - -## Problem - -We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations. - -1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel. -2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem. -3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`. - -Take one situation to give a detailed explanation, if we have two Operators: OP1 and OP2, OP1 has one output `op1_to_op2`, and `op1_to_op2` is the input of OP2. - -If OP1 and OP2 run on the same place(for example CPUPlace), then `op1_2_op2` can be used directly by OP2. - -``` -OP1(CPUPlace) - | - op1_2_op2 - | -OP2(CPUPlace) -``` - -If OP1 and OP2 run one different place, then OP2 cannot `use op1_2_op2` directly. - -Problems under these situations are similar. We can formalize this problem as follow. - -We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$. - -## Solution: data transform - -It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods. - -We can infer kernel type for each input of an operator. We let this kernel type as `actual kernel type for var`, which means this kernel type is the kernel type that can process this input variable. - -We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`. - -We transform the input data from `actual` to `expect` if the actual kernel type is not as same as expect kernel type. - -The algorithm is described as following - -```cpp -void OperatorWithKernel::Run( - const Scope& scope, - const platform::Place& place) const { - ExecutionContext ctx(...); - auto expected_kernel_key = this->GetExpectedKernelType(ctx); - - Scope& new_scope = scope.NewScope(); - - for (auto& var_name : this->Inputs()) { - auto* tensor_in = GetTensor(var_name); - auto kernel_type_for_var = this->GetKernelTypeForVar(...); - if (kernel_type_for_var.place_ != expected_kernel_key.place_) { - auto* trans_var = new_scope.Var(var_name); - auto* out = TransformData(expected_kernel_key, - kernel_type_for_var, - *tensor_in); - SetTensorToVariable(...); - } - } - - auto kernel = kernels.find(expected_kernel_key); - kernel->Compute(ExecutionContext(...)); -} -``` - -then the actual process for the multi-device above will be: - -``` -OP1(CPUPlace) - | -op1_2_op2(on CPU) - | -[transform](from CPU to GPU) - | -op1_2_op2(on GPU) - | -OP2(CUDAPlace) -``` diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md deleted file mode 100644 index 5e391bd62b4f4e123a9a6f35b7adf5726f205635..0000000000000000000000000000000000000000 --- a/doc/fluid/design/multi_devices/operator_kernel_type.md +++ /dev/null @@ -1,91 +0,0 @@ -# Design Doc: The Keys of Operator Kernel Type -## Problem -An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: - -```cpp -struct OpKernelType { - platform::Place place_; - proto::DataType data_type_; -}; -``` -For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) in github. - -It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`. - -We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices. - -For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. - -Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration. - -## Solution - -There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. - -```cpp -struct OpKernelType { - platform::Place place_; - platform::Library library_; - proto::DataType data_type_; - framework::Layout layout_; -}; -``` - -The details are as follows: - -### Place - -`Place` is defined as: - -```cpp -typedef boost::variant Place; -``` - -`Place` represents the device memory where data is located. - - -### Library - -One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable: - -```cpp -enum Library { Plain, MKLDNN, CUDNN }; -``` - -We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator. -A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CDUADeviceContext` contains an Eigen library handle and a cuBLAS handle. - -If we want to support new library, a new enumerator need to be added to `Library` and a corresponding new `LibraryDeviceContext` need to be created. - - -### DataType - - -`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. - -### Layout - -Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout. - -Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework. - -- We take layout as a data member of Tensor. Layout is actually a enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable. - -- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW. - -- The inference of Layout is at run-time, not at compile-time. - -- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. - -`Layout` is also defined as a enum variable: - -```cpp -enum Layout { - kNCHW, - kNHWC, -#ifdef PADDLE_WITH_MKLDNN - knChw8c - ... -#endif -}; -``` diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md deleted file mode 100644 index f32a5b7e8a4d820319a666dab4c3129360e2c924..0000000000000000000000000000000000000000 --- a/doc/fluid/design/network/deep_speech_2.md +++ /dev/null @@ -1,235 +0,0 @@ -# DeepSpeech2 on PaddlePaddle: Design Doc - -We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals: - -- Release a basic distributed implementation of DS2 on PaddlePaddle. -- Contribute a chapter of Deep Speech to PaddlePaddle Book. - -Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan. - -## Table of Contents - -- [Tasks](#tasks) -- [Task Dependency](#task-dependency) -- [Design Details](#design-details) - - [Overview](#overview) - - [Row Convolution](#row-convolution) - - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm) -- [Future Work](#future-work) -- [References](#references) - -## Tasks - -We roughly break down the project into 14 tasks: - -1. Develop an **audio data provider**: - - Json filelist generator. - - Audio file format transformer. - - Spectrogram feature extraction, power normalization etc. - - Batch data reader with SortaGrad. - - Data augmentation (optional). - - Prepare (one or more) public English data sets & baseline. -2. Create a **simplified DS2 model configuration**: - - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*). - - With only bidirectional-GRU (otherwise need *Task 4*). - - With only greedy decoder (otherwise need *Task 5, 6*). -3. Develop to support **variable-shaped** dense-vector (image) batches of input data. - - Update `DenseScanner` in `dataprovider_converter.py`, etc. -4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details): - - Lookahead convolution windows. - - Within-row convolution, without kernels shared across rows. -5. Build KenLM **language model** (5-gram) for beam search decoder: - - Use KenLM toolkit. - - Prepare the corpus & train the model. - - Create infererence interfaces (for Task 6). -6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT: - - Beam search with CTC. - - Beam search with external custom scorer (e.g. LM). - - Try to design a more general beam search interface. -7. Develop a **Word Error Rate evaluator**: - - update `ctc_error_evaluator`(CER) to support WER. -8. Prepare internal dataset for Mandarin (optional): - - Dataset, baseline, evaluation details. - - Particular data preprocessing for Mandarin. - - Might need cooperating with the Speech Department. -9. Create **standard DS2 model configuration**: - - With variable-length audio sequences (need *Task 3*). - - With unidirectional-GRU + row-convolution (need *Task 4*). - - With CTC-LM beam search decoder (need *Task 5, 6*). -10. Make it run perfectly on **clusters**. -11. Experiments and **benchmarking** (for accuracy, not efficiency): - - With public English dataset. - - With internal (Baidu) Mandarin dataset (optional). -12. Time **profiling** and optimization. -13. Prepare **docs**. -14. Prepare PaddlePaddle **Book** chapter with a simplified version. - -## Task Dependency - -Tasks parallelizable within phases: - - - - - - - - - - - - - - - - - - - - - - - - - - -
RoadmapDescription Parallelizable Tasks
Phase I Simplified model & components Task 1 ~ Task 8
Phase II Standard model & benchmarking & profilingTask 9 ~ Task 12
Phase III Documentations Task13 ~ Task14
- - -Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed! - -## Design Details - -### Overview - -Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost. - -Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge. - -The classical DS2 network contains 15 layers (from bottom to top): - -- **Two** data layers (audio spectrogram, transcription text) -- **Three** 2D convolution layers -- **Seven** uni-directional simple-RNN layers -- **One** lookahead row convolution layers -- **One** fully-connected layers -- **One** CTC-loss layer - -
-
-Figure 1. Archetecture of Deep Speech 2 Network. -
- -We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments. - -Key ingredients about the layers: - -- **Data Layers**: - - Frame sequences data of audio **spectrogram** (with FFT). - - Token sequences data of **transcription** text (labels). - - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required. -- **2D Convolution Layers**: - - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension). - - With striding for only the first convlution layer. - - No pooling for all convolution layers. -- **Uni-directional RNNs** - - Uni-directional + row convolution: for low-latency inference. - - Bi-direcitional + without row convolution: if we don't care about the inference latency. -- **Row convolution**: - - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs. - - Not nessesary if with bi-direcitional RNNs. - - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across. -- **Batch Normalization Layers**: - - Added to all above layers (except for data and loss layer). - - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Required Components PaddlePaddle Support Need to Develop
Data Layer I (Spectrogram) Not supported yet.TBD (Task 3)
Data Layer II (Transcription) paddle.data_type.integer_value_sequence -
2D Convolution Layer paddle.layer.image_conv_layer -
DataType Converter (vec2seq) paddle.layer.block_expand -
Bi-/Uni-directional RNNs paddle.layer.recurrent_group -
Row Convolution Layer Not supported yet.TBD (Task 4)
CTC-loss Layer paddle.layer.warp_ctc -
Batch Normalization Layer paddle.layer.batch_norm -
CTC-Beam search Not supported yet. TBD (Task 6)
- - -### Row Convolution - -TODO by Assignees - -### Beam Search with CTC and LM - -
-
-Figure 2. Algorithm for CTC Beam Search Decoder. -
- -- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: - - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; - - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary. -- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding. -- Such external scorer consists of language model, word count or any other custom scorers. -- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7) -- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. - - -## Future Work - -- Efficiency Improvement -- Accuracy Improvement -- Low-latency Inference Library -- Large-scale benchmarking - -## References - -1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. -2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. -3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873 diff --git a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg deleted file mode 100644 index 8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg and /dev/null differ diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png deleted file mode 100644 index 7f7e35f34223162d0f7f0ed97375909c43b830ae..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/network/images/beam_search.png and /dev/null differ diff --git a/doc/fluid/design/network/images/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png deleted file mode 100644 index 1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/network/images/ds2_network.png and /dev/null differ diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst deleted file mode 100644 index 3557d55fe4dbae1f712e0760ca15111ec6f6792d..0000000000000000000000000000000000000000 --- a/doc/fluid/design/network/index_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -复杂网络设计 ------------- - -.. toctree:: - :maxdepth: 1 - - sequence_decoder.md diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst deleted file mode 100644 index 73a7137236bdf0548d35721609351d6deca3013b..0000000000000000000000000000000000000000 --- a/doc/fluid/design/network/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -Complex Network Design ------------------------- - -.. toctree:: - :maxdepth: 1 - - sequence_decoder.md diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md deleted file mode 100644 index b95773c50ca0dcbd1b93529332e035d4de90faa8..0000000000000000000000000000000000000000 --- a/doc/fluid/design/network/sequence_decoder.md +++ /dev/null @@ -1,229 +0,0 @@ -# Design: Sequence Decoder Generating LoDTensors -In tasks such as machine translation and visual captioning, -a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. - -This documentation describes how to implement the sequence decoder as an operator. - -## Beam Search based Decoder -The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. - -In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. - -There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. - -During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . - -For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; -the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. - -## Changing LoD's absolute offset to relative offsets -The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. - -The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, -let's call this format the **absolute-offset LoD** for clarity. - -The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows -```python -[[0, 3, 9] - [0, 2, 3, 3, 3, 9]] -``` -The first level tells that there are two sequences: -- the first's offset is `[0, 3)` -- the second's offset is `[3, 9)` - -while on the second level, there are several empty sequences that both begin and end at `3`. -It is impossible to tell how many empty second-level sequences exist in the first-level sequences. - -There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. - -So let's introduce another format of LoD, -it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. - -For example, to represent the same sequences of the above data - -```python -[[0, 3, 6] - [0, 2, 3, 3, 3, 9]] -``` - -the first level represents that there are two sequences, -their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. - -The second level is the same with the relative offset example because the lower level is a tensor. -It is easy to find out the second sequence in the first-level LoD has two empty sequences. - -The following examples are based on relative-offset LoD. - -## Usage in a simple machine translation model -Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. - -The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. - -**Encoder** -```python -import paddle as pd - -dict_size = 8000 -source_dict_size = dict_size -target_dict_size = dict_size -word_vector_dim = 128 -encoder_dim = 128 -decoder_dim = 128 -beam_size = 5 -max_length = 120 - -# encoder -src_word_id = pd.data( - name='source_language_word', - type=pd.data.integer_value_sequence(source_dict_dim)) -src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) - -src_word_vec = pd.lookup(src_embedding, src_word_id) - -encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) - -encoder_ctx = pd.last_seq(encoder_out_seq) -# encoder_ctx_proj is the learned semantic vector -encoder_ctx_proj = pd.fc( - encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) -``` - -**Decoder** - -```python -def generate(): - decoder = pd.while_loop() - with decoder.step(): - decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory - generated_ids = decoder.memory() # TODO init to batch_size s - generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s - - target_word = pd.lookup(trg_embedding, gendrated_ids) - # expand encoder_ctx's batch to fit target_word's lod - # for example - # decoder_mem.lod is - # [[0 1 3], - # [0 1 3 6]] - # its tensor content is [a1 a2 a3 a4 a5] - # which means there are 2 sentences to translate - # - the first sentence has 1 translation prefixes, the offsets are [0, 1) - # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) - # the target_word.lod is - # [[0, 1, 6] - # [0, 2, 4, 7, 9 12]] - # which means 2 sentences to translate, each has 1 and 5 prefixes - # the first prefix has 2 candidates - # the following has 2, 3, 2, 3 candidates - # the encoder_ctx_expanded's content will be - # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] - encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) - decoder_input = pd.fc( - act=pd.activation.Linear(), - input=[target_word, encoder_ctx_expanded], - size=3 * decoder_dim) - gru_out, cur_mem = pd.gru_step( - decoder_input, mem=decoder_mem, size=decoder_dim) - scores = pd.fc( - gru_out, - size=trg_dic_size, - bias=None, - act=pd.activation.Softmax()) - # K is an config - topk_scores, topk_ids = pd.top_k(scores, K) - topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) - - selected_ids, selected_generation_scores = decoder.beam_search( - topk_ids, topk_generated_scores) - - # update the states - decoder_mem.update(cur_mem) # tells how to update state - generated_ids.update(selected_ids) - generated_scores.update(selected_generation_scores) - - decoder.output(selected_ids) - decoder.output(selected_generation_scores) - -translation_ids, translation_scores = decoder() -``` -The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, -returns the result of the beam search algorithm. - -In this way, users can customize anything on the input or output of beam search, for example: - -1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. -2. Remove some specific candidate in `selected_ids`. -3. Get the final `translation_ids`, remove the translation sequence in it. - -The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), -so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). - -Both of them are two-level `LoDTensors`: - -- The first level represents `batch_size` of (source) sentences. -- The second level represents the candidate ID sets for translation prefix. - -For example, 3 source sentences to translate, and has 2, 3, 1 candidates. - -Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. - -For example, the previous state: - -* LoD is `[0, 1, 3][0, 2, 5, 6]` -* content of tensor is `a1 a2 b1 b2 b3 c1` - -the current state is stored in `encoder_ctx_expanded`: - -* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` -* the content is - - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) - - a2 a2 - - b1 b1 b1 - - b2 - - b3 b3 - - None (c1 has 0 candidates, so c1 is dropped) - -The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. - -The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: - -```python -decoder.output(selected_ids) -decoder.output(selected_generation_scores) -``` - -The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. - -Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. - -Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. - -## LoD and shape changes during decoding -

- -

- -According to the image above, the only phase that changes the LoD is beam search. - -## Beam search design -The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: - -1. `topk_ids`, the top K candidate ids for each prefix. -2. `topk_scores`, the corresponding scores for `topk_ids` -3. `generated_scores`, the score of the prefixes. - -All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. - -It will return three variables: - -1. `selected_ids`, the final candidate beam search function selected for the next step. -2. `selected_scores`, the scores for the candidates. -3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). - -## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` -The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, -so it is natural to store them in arrays. - -Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. - -The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. -It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png deleted file mode 100644 index ab1c2ff23cfff586516876684348bb15bd2084fc..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/onnx/images/project_structure.png and /dev/null differ diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md deleted file mode 100644 index bc1665d7c33eb54cb63e5306a439c1ca67016d1e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/onnx/onnx_convertor.md +++ /dev/null @@ -1,131 +0,0 @@ -# Background - -[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices. - -Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, the we will start with the frontend i.e. Fluid models to ONNX models. - - -# How it works - -ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned. - -When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators. - -Here are a few major considerations when it comes to converting models: - -- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed. -- **Parameters (weights) initialization**: Setting initial parameters on different nodes. -- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid) -- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope. -- **Model validation**: There are two kinds of validations that are necessary: - 1. We need to ensure that the inference outputs of the ops in run inside a model are the same as those when running the ONNX converted ops through an alternative ONNX backend. - 2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers. -- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release. - -One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch. - - -# Project structure - -

- -

- -The project contains four important parts: - -* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces. - -* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters. - -* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic. - -* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model. - - -# Usage -The converter should be designed to very easy-to-use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also provided to verify the correctness of converted model. - -* Convert Fluid inference model to ONNX binary model - - ``` - python convert.py --fluid_model --onnx_model validate True - ``` - -* Validate the converted model - - ``` - python validate.py --fluid_model --onnx_model - ``` - -The conversion and model validation will be completed consecutively, finally output a readable model structure description. And for the converse conversion, users only need to exchange the input and output. - - -# Challenges and mitigation - -## Cycles - -Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle. - -*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops. - -## Sequences - -Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX as well, because they do not support non-padded datatypes like LoDTensors. - -*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors. - -## Ops that can't easily be mapped - -There are ops that just aren't possible to map today: - -**Control flow operators** - -Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase. - -*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them. - - -**Non-existent in Fluid** - -There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc. - -*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD. - - -**Concurrency** - -ONNX does not have any considerations for concurrency right now. - -*Resolution*: There are two ways to approach this: - -a. We choose to not support concurrent models. -b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach. - - -**Overloaded in Fluid** - -There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but a collection of operators. - -*Resolution*: Chain multiple Paddle operators. - - -## Lack of LoDTensors - -As stated above, ONNX only supports simple Tensor values. - -*Resolution*: Deprecate to plain old numpy-able tensors. - - -## Reconstruction from deprecated ONNX ops - -For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs. - -*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op. - - -# Supported models - -As mentioned above, potential risks may come from the conversion of sequence-related models, including the LodTensor, ```if/else``` and ```while``` operator. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models. - -- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams. -- Recurrent models: language model, stacked LSTMs etc. diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md deleted file mode 100644 index 773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/auto_gradient_check.md +++ /dev/null @@ -1,150 +0,0 @@ -## Auto Gradient Check Design - -## Background: -- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges: - 1. The formula for backpropagation formula should be correct according to the forward computation. - 2. The Implementation of the above shoule be correct in CPP. - 3. It is difficult to prepare an unbiased test data. - -- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages: - 1. Numerical gradient checker only needs the forward operator. - 2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator. - -## Mathematical Theory -The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful. - -- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) -- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) - - -## Numerical Gradient Implementation -### Python Interface -```python -def get_numerical_gradient(op, - input_values, - output_name, - input_to_check, - delta=0.005, - local_scope=None): - """ - Get Numerical Gradient for the input of an operator. - - :param op: C++ operator instance, could be an network. - :param input_values: The input variables. Should be an dictionary, whose key is - variable name, and value is a numpy array. - :param output_name: The final output variable name. - :param input_to_check: The input variable with respect to which the gradient has to be computed. - :param delta: The perturbation value for numerical gradient method. The - smaller the delta, the more accurate the result. But if the delta is too - small, it will suffer from the numerical stability problem. - :param local_scope: The local scope used for get_numeric_gradient. - :return: The gradient array in numpy format. - """ -``` - -### Explanation: - -- Why do we need an `output_name` - - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable. - -- Why do we need `input_to_check` - - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input. - - -### Core Algorithm Implementation - - -```python - # we only compute the gradient of one element a time. - # we use a for loop to compute the gradient of each element. - for i in xrange(tensor_size): - # get one input element using the index i. - original = tensor_to_check.get_float_element(i) - - # add delta to it, run the forward op and then - # get the new value of the result tensor. - x_pos = original + delta - tensor_to_check.set_float_element(i, x_pos) - y_pos = get_output() - - # Subtract delta from this element, run the op again - # and get the new value of the result tensor. - x_neg = original - delta - tensor_to_check.set_float_element(i, x_neg) - y_neg = get_output() - - # restore old value - tensor_to_check.set_float_element(i, original) - - # compute the gradient of this element and store - # it into a numpy array. - gradient_flat[i] = (y_pos - y_neg) / delta / 2 - - # reshape the gradient result to the shape of the source tensor. - return gradient_flat.reshape(tensor_to_check.get_dims()) -``` - -## Auto Gradient Check Framework - -Each Operator Kernel has three kinds of Gradient: - -1. Numerical gradient -2. CPU kernel gradient -3. GPU kernel gradient (if supported by the device) - -The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps: - -1. Calculate the numerical gradient -2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient. -3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported) - -#### Python Interface - -```python - def check_grad(self, - forward_op, - input_vars, - inputs_to_check, - output_name, - no_grad_set=None, - only_cpu=False, - max_relative_error=0.005): - """ - :param forward_op: used to create backward_op - :param input_vars: numpy value of input variable. The following - computation will use these variables. - :param inputs_to_check: the input variable with respect to which the - gradient will be computed. - :param output_name: The final output variable name. - :param max_relative_error: The relative tolerance parameter. - :param no_grad_set: used to create backward ops - :param only_cpu: only compute and check gradient on cpu kernel. - :return: - """ -``` - -### How to check if two numpy arrays are close enough? -if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad. - -```python -numerical_grad = ... -operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) - -abs_numerical_grad = numpy.abs(numerical_grad) -# if abs_numerical_grad is nearly zero, then use abs error for -# numeric_grad, instead of relative error. -abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 - -diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad -max_diff = numpy.max(diff_mat) -``` - - -#### Notes: -The Input data for auto gradient checker should be reasonable to avoid numerical stability problem. - - -#### References: - -- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) -- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/fluid/design/others/dcgan.png b/doc/fluid/design/others/dcgan.png deleted file mode 100644 index 15e8e290a111ff43900934341365cb4360d87d28..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/others/dcgan.png and /dev/null differ diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md deleted file mode 100644 index 7167470088766985fa5ad31657410309330fd725..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/gan_api.md +++ /dev/null @@ -1,253 +0,0 @@ -# Design for GAN - -GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. - -It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. - -In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. - -

-
-Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run. -

- -The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563. - -

-
-Figure 2. Photo borrowed from the original DC-GAN paper. -

- -## The Conditional-GAN might be a class. -This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure: - -- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API: - -- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well. - -- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen. -Returns a generated image. - -- discriminator(image): -Given an image, decide if it is from a real source or a fake one. -Returns a 0/1 binary label. - -- build_model(self): -build the whole GAN model, define training loss for both generator and discrimator. - -## Discussion on Engine Functions required to build GAN -- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly) -- Different optimizers responsible for optimizing different loss. - -To be more detailed, we introduce our design of DCGAN as following: - -### Class member Function: Initializer -- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth. -- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G. -```python -class DCGAN(object): - def __init__(self, y_dim=None): - - # hyper parameters - self.y_dim = y_dim # conditional gan or not - self.batch_size = 100 - self.z_dim = z_dim # input noise dimension - - # define parameters of discriminators - self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer()) - self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data - self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) - self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data - self.D_W2 = pd.Varialble(np.random.rand(128, 1)) - self.D_b2 = pd.Variable(np.zeros(128)) - self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2] - - # define parameters of generators - self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) - self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data - self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) - self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data - self.G_W2 = pd.Varialble(np.random.rand(128, 1)) - self.G_b2 = pd.Variable(np.zeros(128)) - self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2] -``` - -### Class member Function: Generator -- Given a noisy input z, returns a fake image. -- Concatenation, batch-norm, FC operations required; -- Deconv layer required, which is missing now... -```python -class DCGAN(object): - def generator(self, z, y = None): - # input z: the random noise - # input y: input data label (optional) - # output G_im: generated fake images - - if not self.y_dim: - z = pd.layer.concat(1, [z, y]) - - G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) - G_h0_bn = pd.layer.batch_norm(G_h0) - G_h0_relu = pd.layer.relu(G_h0_bn) - - G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) - G_h1_bn = pd.layer.batch_norm(G_h1) - G_h1_relu = pd.layer.relu(G_h1_bn) - - G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) - G_im = pd.layer.tanh(G_im) - return G_im -``` - -### Class member function: Discriminator -- Given a noisy input z, returns a fake image. -- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; -```python -class DCGAN(object): - def discriminator(self, image): - # input image: either generated images or real ones - # output D_h2: binary logit of the label - - D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) - D_h0_bn = pd.layer.batchnorm(h0) - D_h0_relu = pd.layer.lrelu(h0_bn) - - D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) - D_h1_bn = pd.layer.batchnorm(D_h1) - D_h1_relu = pd.layer.lrelu(D_h1_bn) - - D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) - return D_h2 -``` - -### Class member function: Build the model -- Define data readers as placeholders to hold the data; -- Build generator and discriminators; -- Define two training losses for discriminator and generator, respectively. -If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: -```python -class DCGAN(object): - def build_model(self): - if self.y_dim: - self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) - self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) - self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) - self.z = pd.data(tf.float32, [None, self.z_size]) - - # step 1: generate images by generator, classify real/fake images with discriminator - if self.y_dim: # if conditional GAN, includes label - self.G = self.generator(self.z, self.y) - self.D_t = self.discriminator(self.images) - # generated fake images - self.sampled = self.sampler(self.z, self.y) - self.D_f = self.discriminator(self.G) - else: # original version of GAN - self.G = self.generator(self.z) - self.D_t = self.discriminator(self.images) - # generate fake images - self.sampled = self.sampler(self.z) - self.D_f = self.discriminator(self.images) - - # step 2: define the two losses - self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) - self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) - self.d_loss = self.d_loss_real + self.d_loss_fake - - self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) -``` - -If we do not have dependency engine but blocks, the module building our GAN model will be like this: -```python -class DCGAN(object): - def build_model(self, default_block): - # input data in the default block - if self.y_dim: - self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) - self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) - # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) - self.z = pd.data(tf.float32, [None, self.z_size]) - - # step 1: generate images by generator, classify real/fake images with discriminator - with pd.default_block().g_block(): - if self.y_dim: # if conditional GAN, includes label - self.G = self.generator(self.z, self.y) - self.D_g = self.discriminator(self.G, self.y) - else: # original version of GAN - self.G = self.generator(self.z) - self.D_g = self.discriminator(self.G, self.y) - self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie)) - - with pd.default_block().d_block(): - if self.y_dim: # if conditional GAN, includes label - self.D_t = self.discriminator(self.images, self.y) - self.D_f = self.discriminator(self.G, self.y) - else: # original version of GAN - self.D_t = self.discriminator(self.images) - self.D_f = self.discriminator(self.G) - - # step 2: define the two losses - self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) - self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) - self.d_loss = self.d_loss_real + self.d_loss_fake -``` -Some small confusion and problems with this design: -- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph. -- Requires ability to create a block anytime, rather than in if-else or rnn only; - -## Main function for the demo: -Generally, the user of GAN just need to the following things: -- Define an object as DCGAN class; -- Build the DCGAN model; -- Specify two optimizers for two different losses with respect to different parameters. -```python -# pd for short, should be more concise. -from paddle.v2 as pd -import numpy as np -import logging - -if __name__ == "__main__": - # dcgan class in the default graph/block - # if we use dependency engine as tensorflow - # the codes, will be slightly different like: - # dcgan = DCGAN() - # dcgan.build_model() - with pd.block() as def_block: - dcgan = DCGAN() - dcgan.build_model(def_block) - - # load mnist data - data_X, data_y = self.load_mnist() - - # Two subgraphs required!!! - with pd.block().d_block(): - d_optim = pd.train.Adam(lr = .001, beta= .1) - d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D) - with pd.block.g_block(): - g_optim = pd.train.Adam(lr = .001, beta= .1) - g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G) - - # executor - sess = pd.executor() - - # training - for epoch in xrange(10000): - for batch_id in range(N / batch_size): - idx = ... - # sample a batch - batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size] - # sample z - batch_z = np.random.uniform(-1., 1., [batch_size, z_dim]) - - if batch_id % 2 == 0: - sess.run(d_step, - feed_dict = {dcgan.images: batch_im, - dcgan.y: batch_label, - dcgan.z: batch_z}) - else: - sess.run(g_step, - feed_dict = {dcgan.z: batch_z}) -``` - -# More thinking about dependency engine v.s. block design: -- What if we just want to run an intermediate result? Do we need to run the whole block/graph? -- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage? diff --git a/doc/fluid/design/others/graph.md b/doc/fluid/design/others/graph.md deleted file mode 100644 index 7519a65df835a39fe14f6ef45530afff170191ff..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/graph.md +++ /dev/null @@ -1,70 +0,0 @@ -# Design Doc: Computations as a Graph - -A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before. - -This document explains that the construction of a graph as three steps: - -- construct the forward part -- construct the backward part -- construct the optimization part - -## The Construction of a Graph - -Let us take the problem of image classification as a simple example. The application program that trains the model looks like: - -```python -x = layer.data("images") -l = layer.data("label") -y = layer.fc(x) -cost = layer.mse(y, l) -optimize(cost) -train(cost, reader=mnist.train()) -``` - -### Forward Part - -The first four lines of above program build the forward part of the graph. - -![](images/graph_construction_example_forward_only.png) - -In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators. - -Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so to run at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch. - -In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message. - -### Backward Part - -The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`. - -`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part. - -![](images/graph_construction_example_forward_backward.png) - -According to the chain rule of gradient computation, `ConstructBackwardGraph` would - -1. create a gradient operator G for each operator F, -1. make all inputs, outputs, and outputs' gradient of F as inputs of G, -1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and -1. make all these gradients as outputs of G. - -### Optimization Part - -For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. Here results in the complete graph: - -![](images/graph_construction_example_all.png) - -## Block and Graph - -The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block. - -A Block keeps operators in an array `BlockDesc::ops` - -```protobuf -message BlockDesc { - repeated OpDesc ops = 1; - repeated VarDesc vars = 2; -} -``` - -in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators. diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md deleted file mode 100644 index 97f395133b48a1d0ed5136f0ebc8720b8ca87ded..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/graph_survey.md +++ /dev/null @@ -1,232 +0,0 @@ -## Survey on Graph - -Neural network framework often provides symbolic API for users to write network topology conveniently. This doc manily focus on symbolic API in most popular neural network frameworks, and try to find out how to parse symbolic configuration to a portable file, such as protobuf or json. - -### Mxnet - -The core concept of symbolic API is `Symbol`. Mxnet implements `Symbol` class in C++, and export to Python using C-API. Please refer to the comments in Mxnet: - - -`Symbol` is help class used to represent the operator node in Graph. -`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value. - - -A simple network topology wrote by Symbol is as follows: - -```python -def get_symbol(num_classes=10, **kwargs): - data = mx.symbol.Variable('data') - data = mx.symbol.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) - act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) - mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') - return mlp -``` - - - -Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null. - -Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. - -And Symbol can be saved to a Json file. - -Here is a detailed example: - -``` ->>> import mxnet as mx ->>> data = mx.symbol.Variable('data') ->>> print data.debug_str() -Variable:data - ->>> data = mx.symbol.Flatten(data=data) ->>> print data.debug_str() -Symbol Outputs: - output[0]=flatten0(0) -Variable:data --------------------- -Op:Flatten, Name=flatten0 -Inputs: - arg[0]=data(0) version=0 - ->>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) ->>> print fc1.debug_str() -Symbol Outputs: - output[0]=fc1(0) -Variable:data --------------------- -Op:Flatten, Name=flatten0 -Inputs: - arg[0]=data(0) version=0 -Variable:fc1_weight -Variable:fc1_bias --------------------- -Op:FullyConnected, Name=fc1 -Inputs: - arg[0]=flatten0(0) - arg[1]=fc1_weight(0) version=0 - arg[2]=fc1_bias(0) version=0 -Attrs: - num_hidden=128 - -``` - - -### TensorFlow - - -The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow: - -A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session). - -A simple example is as follows: - -```python - # Build a dataflow graph. - c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) - d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) - e = tf.matmul(c, d) - - # Construct a `Session` to execute the graph. - sess = tf.Session() - - # Execute the graph and store the value that `e` represents in `result`. - result = sess.run(e) -``` - - -The main method of `Tensor` is as follows: - - -```python -@property -def op(self): - """The `Operation` that produces this tensor as an output.""" - return self._op - -@property -def dtype(self): - """The `DType` of elements in this tensor.""" - return self._dtype - -@property -def graph(self): - """The `Graph` that contains this tensor.""" - return self._op.graph - -@property -def name(self): - """The string name of this tensor.""" - if not self._op.name: - raise ValueError("Operation was not named: %s" % self._op) - return "%s:%d" % (self._op.name, self._value_index) - -@property -def device(self): - """The name of the device on which this tensor will be produced, or None.""" - return self._op.device -``` - - -Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency. - - -Here is a detailed example: - - -``` ->>> import tensorflow as tf ->>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) ->>> print c.graph - ->>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) ->>> print d.graph - ->>> e = tf.matmul(c, d) ->>> print e.graph - -``` - -### Dynet - - -The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++. - - -A simple example is as follows: - -```cpp -ComputationGraph cg; -Expression W = parameter(cg, pW); - -Expression in = input(cg, xs[i]); -Expression label = input(cg, ys[i]); -Expression pred = W * in; -Expression loss = square(pred - label); -``` - -The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. - -Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency. - - -Here is a detailed example: - -write topology in C++ - -``` -ComputationGraph cg; -Expression W = parameter(cg, pW); -cg.print_graphviz(); - -Expression pred = W * xs[i]; -cg.print_graphviz(); - -Expression loss = square(pred - ys[i]); -cg.print_graphviz(); -``` - -compile and print - -``` -# first print -digraph G { - rankdir=LR; - nodesep=.05; - N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; -} -# second print -digraph G { - rankdir=LR; - nodesep=.05; - N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; - N1 [label="v1 = v0 * -0.98"]; - N0 -> N1; -} -# third print -digraph G { - rankdir=LR; - nodesep=.05; - N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; - N1 [label="v1 = v0 * -0.98"]; - N0 -> N1; - N2 [label="v2 = -1.88387 - v1"]; - N1 -> N2; - N3 [label="v3 = -v2"]; - N2 -> N3; - N4 [label="v4 = square(v3)"]; - N3 -> N4; -} -``` - -### Conclusion - - -Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features: - -- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter. -- Expression corresponds with a global Graph, and Expression can also be composed. -- Expression tracks all dependency and can be taken as a run target diff --git a/doc/fluid/design/others/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash deleted file mode 100755 index 35e6997abd17588e17a82d448918fc1b3bd7220e..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/images/graph_construction_example.bash +++ /dev/null @@ -1,11 +0,0 @@ -cat ./graph_construction_example.dot | \ - sed 's/color=red/color=red, style=invis/g' | \ - sed 's/color=green/color=green, style=invis/g' | \ - dot -Tpng > graph_construction_example_forward_only.png - -cat ./graph_construction_example.dot | \ - sed 's/color=green/color=green, style=invis/g' | \ - dot -Tpng > graph_construction_example_forward_backward.png - -cat ./graph_construction_example.dot | \ - dot -Tpng > graph_construction_example_all.png diff --git a/doc/fluid/design/others/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot deleted file mode 100644 index e115f9844bae6ad24f638c8ed4749cea8aff06a9..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/images/graph_construction_example.dot +++ /dev/null @@ -1,68 +0,0 @@ -digraph ImageClassificationGraph { - ///////// The forward part ///////// - FeedX [label="Feed", color=blue, shape=box]; - FeedY [label="Feed", color=blue, shape=box]; - InitW [label="Init", color=blue, shape=diamond]; - Initb [label="Init", color=blue, shape=diamond]; - FC [label="FC", color=blue, shape=box]; - MSE [label="MSE", color=blue, shape=box]; - - x [label="x", color=blue, shape=oval]; - l [label="l", color=blue, shape=oval]; - y [label="y", color=blue, shape=oval]; - W [label="W", color=blue, shape=doublecircle]; - b [label="b", color=blue, shape=doublecircle]; - cost [label="cost", color=blue, shape=oval]; - - FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; - FeedY -> l [color=blue]; - InitW -> W [color=blue]; - Initb -> b [color=blue]; - W -> FC [color=blue]; - b -> FC [color=blue]; - l -> MSE [color=blue]; - - ////////// The backward part ///////// - MSE_Grad [label="MSE_grad", color=red, shape=box]; - FC_Grad [label="FC_grad", color=red, shape=box]; - - d_cost [label="d cost", color=red, shape=oval]; - d_y [label="d y", color=red, shape=oval]; - d_b [label="d b", color=red, shape=oval]; - d_W [label="d W", color=red, shape=oval]; - - cost -> MSE_Grad [color=red]; - d_cost -> MSE_Grad [color=red]; - l -> MSE_Grad [color=red]; - y -> MSE_Grad -> d_y [color=red]; - - x -> FC_Grad [color=red]; - y -> FC_Grad [color=red]; - d_y -> FC_Grad [color=red]; - W -> FC_Grad -> d_W [color=red]; - b -> FC_Grad -> d_b [color=red]; - - ////////// The optimizaiton part ////////// - - OPT_W [label="SGD", color=green, shape=box]; - OPT_b [label="SGD", color=green, shape=box]; - - W -> OPT_W [color=green]; - b -> OPT_b [color=green]; - d_W -> OPT_W -> W [color=green]; - d_b -> OPT_b -> b [color=green]; - - ////////// Groupings ////////// - - subgraph clusterMSE { - style=invis; - MSE; - MSE_Grad; - } - - subgraph clusterFC { - style=invis; - FC; - FC_Grad; - } -} diff --git a/doc/fluid/design/others/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png deleted file mode 100644 index 261611a5721f9aa97874f7e6d897fe48cf667db2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/others/images/graph_construction_example_all.png and /dev/null differ diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png deleted file mode 100644 index 4c69687f4a6a181138f3df72ce5e8aa48487b5be..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png and /dev/null differ diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png deleted file mode 100644 index e668c16e0cac73acb4e5dc2b1827557ae77126b4..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/others/images/graph_construction_example_forward_only.png and /dev/null differ diff --git a/doc/fluid/design/others/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md deleted file mode 100644 index a7ac3f17c44ca94a669a8f1e283b291bceb42317..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/parameters_in_cpp.md +++ /dev/null @@ -1,41 +0,0 @@ -# Design Doc: The C++ Class `Parameters` - -`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). - -We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation: -* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. -* We did not support sharing Parameters while training. We just trigger `memcpy` when start training. - -It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`: - -1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. -It is evident that we should use `paddle::Parameter` when developing `Parameters`. -However, the `Parameter` class contains many functions and does not have a clear interface. -It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. -When we developing `Parameters`, we only use `create/store Parameter` functionality. -We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation. - -2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. -We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. -Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs. -`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device. - -3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. -So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). - - -The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one. - -1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters. - -2. Implementation a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` uses `Parameters` as a class member. - -3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies. -Because we need share `Parameters` between topologies, it is `Parameters`'s response to exchange Parameters between GPUs. -`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` only used to train one topology and we need to support train many topologies in Paddle, i.e., there could be many GradientMachines use one `Parameters`. - * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs. - * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler. - -4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies. - -5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. In the end of this code refactoring, we could change `ParameterUpdater` directly uses `Parameters` to make `ParameterUpdater`'s implementation clear. diff --git a/doc/fluid/design/others/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md deleted file mode 100644 index c7aeed7f9b4637e1c29d530f37b42d12500af82f..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/simple_op_design.md +++ /dev/null @@ -1,202 +0,0 @@ -## Interaction between C++ and Python - -Users employ API in Python to describe their own network, however, the network construction actually happens in C++. so Protobuf is introduced to send the message between Python and C++. - -The Interaction between Python and C++ can be simplified as two steps: - -1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time. - -2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task. - -### Message from C++ to Python - -We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.” - -Following message are necessary: - -1. Op's name, and its simple comment. -2. Input and output variable number; each variable's name, type, and comment. -3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. - -So `OpProto` can be defined as follows: - -```proto -enum AttrType { - INT = 1; - FLOAT = 2; - STRING = 3; - INTS = 4; - FLOATS = 5; - STRINGS = 6; -}; - -message AttrValue { - AttrType type = 1; - optional int iv = 2; - optional float fv = 3; - optional string sv = 4; - repeated int ivs = 5; - repeated float fvs = 6; - repeated string svs = 7; -}; - -message AttrProto { - required string name = 1; - required string comment = 2; - required AttrType type = 3; -}; - -message VarProto { - required string name = 1; - required string comment = 2; - required bool is_tensor = 3; -}; - -message OpProto { - repeated VarProto inputs = 1; - repeated VarProto outputs = 2; - repeated AttrProto attrs = 3; - required string type = 4; - required string comment = 5; -}; -``` - -To generate Python code automatically: - -```python -def create_python_ops_creatation_functions(): - op_protos = paddle.framework.OpRegistry.get_all_op_proto() - for type_name in op_protos: - op_proto = op_protos[type_name] - def __impl__(**kwargs): # User must use key word args in Paddle API - inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] - outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] - attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] - opdesc = (input, outputs, type_name, attrs) - return paddle.framework.OpRegistry.CreateOp(opdesc) - __impl__.__doc__ = create_doc_string(op_proto) - globals()[type_name] = __impl__ - -create_python_ops_creatation_functions() -``` - -### Message from Python to C++ - -To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. - -```proto -message OpDesc { - required string type = 1; - repeated string inputs = 2; - repeated string outputs = 3; - map attrs = 4; -}; -``` - -## OpProto Register - -Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. - -```cpp -class OpProtoMaker { -public: - OpProtoMaker(OpProto* proto): proto_(proto) {} -protected: - OpProto* proto_; - void AddInput(const std::string& name, const std::string& desc) {...} - void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} - void AddComment(const std::string& comment) { ... } -}; - -class OpRegistry { -public: - using OpCreator = std::function; - - template - static void RegisterOp(const std::string& name) { - gCreators_[name] = [](const OpDesc& desc) { - return new OpType(desc); - }; - OpProto& opProto = gProtos_[name]; - OpMaker()(&opProto); - } - - static map gCreators_; - static map gProtos_; -}; - -template -class OpRegister { - public: - OpRegister(std::string type) { - OpRegistry::RegisterOp(type); - } -}; - -#define REGISTER_OP(op_class, op_maker_class, type_name) \ - class op_class##Register { \ - private: \ - const static OpRegister<#op_class, #op_maker_class> reg; \ - }; \ - const Register op_class##Register::reg(#type_name); - -class CosineOp { -// ... -} - -struct CosineOpProtoMaker : public OpProtoMaker { - CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { - AddInput("input", "input of cosine op"); - AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0); - AddType("cos"); - AddComment("This is cos op"); - } -} - -REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); -``` - -In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. - -## Python API - -Python APIs are divided into two types, high-level API and low-level API. - -### High-Level API - -High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. - -Here is a sample about how a define a fc layer: - -```python -hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); -``` - -`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. - -The definition of `fc_layer()`: - -```python -def fc_layer(input, size, with_bias, activation): - attr_map = {"size":size} - check_attrs(attr_map) - w = make_variable('w') - if with_bias: - b = make_variable('b') - else: - b = None - fc_output = make_variable('fc_output'); - fc_op(input, w, b, fc_output, attr_map) - act_output = make_variable('sigmod_output'); - if activation == "sigmod": - sigmod_op(fc_output, act_output); - elif: - # ... - return act_output; -``` - -### Low Leval API - -In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. - -*TODO* diff --git a/doc/fluid/design/others/test.dot b/doc/fluid/design/others/test.dot deleted file mode 100644 index 62c69b8fc8010a26a54a6ee8ef1488aad94d747a..0000000000000000000000000000000000000000 --- a/doc/fluid/design/others/test.dot +++ /dev/null @@ -1,35 +0,0 @@ - -digraph Test { - z -> generator -> G_img; - G_img -> discriminator -> D_f -> d_loss_f; - label0 -> d_loss_f -> d_loss; - - img -> discriminator -> D_t -> d_loss_t; - label1 -> d_loss_t -> d_loss; - - d_loss -> d_loss_t[color=red, style=dashed]; - d_loss -> d_loss_f[color=red, style=dashed]; - d_loss_t -> D_t[color=red, style=dashed]; - d_loss_f -> D_f[color=red, style=dashed]; - D_t -> discriminator[color=red, style=dashed]; - D_f -> discriminator[color=red, style=dashed]; - - D_f -> g_loss; - label2 -> g_loss; - - g_loss -> D_f[color=green, style=dashed]; - D_f -> discriminator[color=green, style=dashed]; - discriminator -> G_img[color=green, style=dashed]; - G_img -> generator[color=green, style=dashed]; - - discriminator [color=red, shape=box]; - generator [color=green, shape=box]; - z [shape=diamond]; - img [shape=diamond]; - label0 [shape=diamond]; - label1 [shape=diamond]; - label2 [shape=diamond]; - - d_loss [color=red]; - g_loss [color=green]; -} diff --git a/doc/fluid/design/others/test.dot.png b/doc/fluid/design/others/test.dot.png deleted file mode 100644 index 4e121a40b9f7b2232d7cdda315bad15926446f55..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/others/test.dot.png and /dev/null differ diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md deleted file mode 100644 index 085352fc5614d693e63a2f7241e868a9649456af..0000000000000000000000000000000000000000 --- a/doc/fluid/design/quantization/fixed_point_quantization.md +++ /dev/null @@ -1,110 +0,0 @@ -Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point to represent weights and activations, which usually are in singe-precision float-point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements. It is especially important for the inference in embedded-device deployment. - -According to some experiments, the apporach to quantize the model trained in float point directly works effectively on the large models, like the VGG model having many parameters. But the accuracy drops a lot for the small model. In order to improve the tradeoff between accuracy and latency, many quantized training apporaches are proposed. - -This document is to design a quantized training framework on Fluid. The first part will introduce how to quantize, The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale. - - -### How to quantize - -There are many ways to quantize the float value to fixed-point value. For example: - -$$ r = min(max(x, a), b)$$ -$$ s = \frac{b - a}{n - 1} $$ -$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$ - -where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer. - - -The quantization we applied is parameterized by the number of quantization levels and maximum absolute value: - -$$ M = max(abs(x)) $$ -$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$ - -where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer. - - -Wether the *min-max* quantization or *max-abs* quantization, they also can be represent: - -$q = scale * r + b$ - -We call *min-max*, *max-abs* as the quantization arguments, also call them quantization scale or quantization range. - - -How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part. - - -### Training Framework - -#### Forward pass - -The forward pass is simulated quantization, see Figure 1. - -The training framework is as following figure. - -

-
-Figure 1. Forward in training with simulated quantization. -

- -- Firstly, both input and weight will be quantized to 8-bit integers. -- Second, do the multiplication (or convolution) operation with integers. -- Third, dequantize the multiplication (or convolution) results to 32-bit float point. -- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized. - -For general matrix multiplication (GEMM), quantize for $X$ and $W$: - -$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$ -$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$ - -Do GEMM: - -$$ Y = X_q * W_q $$ - - -Dequantize $Y$: - -$$ -\begin{align} -Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\ - &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\ - &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m) -\end{align} -$$ - -From these formulas, dequantization also can be moved before GEMM, do dequantization for $Xq$ and $Wq$ at first, then do GEMM. The forward workflow in training is equivalent to following framework. - -

-
-Figure 2. Equivalent forward in training with simulated quantization. -

- -We use this equivalent workflow in the training. In our desigin, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of quantization and de-quantization operator are still in floating point, they are called faked quantization and de-quantization operator. And the training framework is called simulated quantization. - -#### Backward pass - -See Figure 3. The gradients are calculated by dequantized weights and activations. All inputs and outputs are float point with 32-bit. And in the weight updating process, the gradients will be added to the original weight, not the quantized or dequantized weights. - -

-
-Figure 3. Backward and weight updating in training with simulated quantization. -

- -So the quantization transipler will change some inputs of the corresponding backward operators. - -### How to calculate quantization scale - -There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs. - -For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished. - -For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them: - - -1. Calculate the mean of maximum absolute during a window. -2. Calculate the max of maximum absolute during a window. -3. Calculate the running mean of maximum absolute during a window, as follows: - - $$ Vt = (1 - k) * V + k * V_{t-1} $$ - - where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9. diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png deleted file mode 100644 index 84f8235ab87cb631992b691f8e05b9c0b6c93da2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/quantization/quantization_backward_and_optimization.png and /dev/null differ diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png deleted file mode 100644 index df49c864537c047c785da12d24893e54ce0a5341..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/quantization/quantization_equivalent_forward.png and /dev/null differ diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png deleted file mode 100644 index 0913f61621bb6533bcb10bd1d18120ccaaa96cff..0000000000000000000000000000000000000000 Binary files a/doc/fluid/design/quantization/quantization_forward.png and /dev/null differ diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md deleted file mode 100644 index 7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/api_doc_std_cn.md +++ /dev/null @@ -1,221 +0,0 @@ -# API注释撰写标准 - -- [API注释撰写标准](#api) - - [API注释模块](#api) - - [格式及示例](#) - - [完整示例](#) - - -## API注释模块 - -API文档须包含以下几个模块(排列顺序为文档撰写顺序): - -- Python API Definition - - API的代码定义。 - -- Function Description - - API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。 - -- Args Description - - API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。 - -- Returns - - API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。 - -- Raises(如果有) - - 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。 - -- Note(如果有) - - 注意事项。当有多条注意事项时,应分条列出。 - -- Examples - - API的使用示例。 - - -## 格式及示例 - -API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明): - -- Python API Definition - - - 格式: - - [Python API Definition] - - - 示例 - - ``` - fc(input, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - name=None, - main_program=None, - startup_program=None) - ``` - -- Function Description - - - 格式 - - 本模块应包含以下内容(排列顺序为文档撰写顺序): - - [Function Description] - - [Formula] - - [Symbols' Descriptions if necessary] - - [References if necessary] - - - 示例 - - [Function Description] - - ``` - **Fully Connected Layer** - - The fully connected layer can take multiple tensors as its inputs. It - creates a variable called weights for each input tensor, which represents - a fully connected weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor with its coresponding - weight to produce an output Tensor. If multiple input tensors are given, - the results of multiple multiplications will be sumed up. If bias_attr is - not None, a bias variable will be created and added to the output. Finally, - if activation is not None, it will be applied to the output as well. - ``` - - [Formula] - - ``` - This process can be formulated as follows: - - .. math:: - - Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) - ``` - - [Symbols' Descriptions if necessary] - - ``` - In the above equation: - - * :math:`N`: Number of the input. - * :math:`X_i`: The input tensor. - * :math:`W`: The weights created by this layer. - * :math:`b`: The bias parameter created by this layer (if needed). - * :math:`Act`: The activation function. - * :math:`Out`: The output tensor. - ``` - - [References if necessary] - - 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例: - - ``` - Refer to `Layer Normalization `_ for more details. - ``` - - -- Args Description - - - 格式 - - \[Arg's Name\][(Data Type, Default Value)][Description] - - - 示例 - - fc的部分参数注释如下: - - ``` - Args: - input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of - the input tensor(s) is at least 2. - param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable - parameters/weights of this layer. - name (str, default None): The name of this layer. - ``` - -- Returns - - - 格式 - - [Name][Shape] - - - 示例 - - ``` - Returns: - A tensor variable storing the transformation result. - ``` - - 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例: - - ``` - Returns: - A tuple containing: - The hidden state of LSTM whose shape is (T X D). - The cell state of LSTM whose shape is (T X D). - ``` - -- Raises - - - 格式 - - [Exception Type][Condition] - - - 示例 - - ``` - Raises: - ValueError: If the rank of the input is less than 2. - ``` - -- Note - - - 格式 - - [Note] - - - 示例 - - fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例: - - ``` - Note: - 1. When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - 2. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. - ``` - -- Examples - - - 格式 - - \[Python Code Snipper] - - - 示例 - - ``` - Examples: - .. code-block:: python - - data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=data, size=1000, act="tanh") - ``` - -## 完整示例 - -fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md deleted file mode 100644 index f175b219750d1c765a6a111c2ec3aa732fa46175..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/api_doc_std_en.md +++ /dev/null @@ -1,227 +0,0 @@ -# API Doc Standard - -- [API Doc Standard](#api-doc-standard) - - [API Doc Structure](#api-doc-structure) - - [Format and Examples](#format-and-examples) - - [Complete Example](#complete-example) - - -## API Doc Structure - -API Doc should contain the following parts(please write them in order): - -- Python API Definition - - The definition of API - -- Function Description - - Description of API's function. - The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula. - -- Args Description - - Description of API parameters. - Introduce parameters one by one according to the order in API definition. - The introduction includes: data type, default value(if any), meaning, etc. - -- Returns - - Introduction of API returned value. - Introduce meaning of returned value, provide correspoding format if necessary. - If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order. - -- Raises(if any) - - Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order. - -- Note(if any) - - Matters needing attention. If there are more than one matters, they should be listed in order. - -- Examples - - Examples of how to use API. - - -## Format and Examples - -API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html). -Format and examples of each part of API documantation are as follows: (take fc for example) - -- Python API Definition - - - Format - - [Python API Definition] - - - Example - - ``` - fc(input, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - name=None, - main_program=None, - startup_program=None) - ``` - -- Function Description - - - Format - - This part contains (please write them in order): - - [Function Description] - - [Formula] - - [Symbols' Descriptions if necessary] - - [References if necessary] - - - Example - - [Function Description] - - ``` - **Fully Connected Layer** - - The fully connected layer can take multiple tensors as its inputs. It - creates a variable called weights for each input tensor, which represents - a fully connected weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor with its coresponding - weight to produce an output Tensor. If multiple input tensors are given, - the results of multiple multiplications will be sumed up. If bias_attr is - not None, a bias variable will be created and added to the output. Finally, - if activation is not None, it will be applied to the output as well. - ``` - - [Formula] - - ``` - This process can be formulated as follows: - - .. math:: - - Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) - ``` - - [Symbols' Descriptions if necessary] - - ``` - In the above equation: - - * :math:`N`: Number of the input. - * :math:`X_i`: The input tensor. - * :math:`W`: The weights created by this layer. - * :math:`b`: The bias parameter created by this layer (if needed). - * :math:`Act`: The activation function. - * :math:`Out`: The output tensor. - ``` - - [References if necessary] - - Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example: - - ``` - Refer to `Layer Normalization `_ for more details. - ``` - - -- Args Description - - - Format - - \[Arg's Name\][(Data Type, Default Value)][Description] - - - Example - - part of fc parameters are as follows: - - ``` - Args: - input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of - the input tensor(s) is at least 2. - param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable - parameters/weights of this layer. - name (str, default None): The name of this layer. - ``` - -- Returns - - - Format - - [Name][Shape] - - - Example - - ``` - Returns: - A tensor variable storing the transformation result. - ``` - - when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example: - - ``` - Returns: - A tuple containing: - The hidden state of LSTM whose shape is (T X D). - The cell state of LSTM whose shape is (T X D). - ``` - -- Raises - - - Format - - [Exception Type][Condition] - - - Example - - ``` - Raises: - ValueError: If the rank of the input is less than 2. - ``` - -- Note - - - Format - - [Note] - - - Example - - there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example: - - ``` - Note: - 1. When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - 2. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. - ``` - -- Examples - - - Format - - \[Python Code Snipper] - - - Example - - ``` - Examples: - .. code-block:: python - - data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=data, size=1000, act="tanh") - ``` - -## Complete Example - -Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png deleted file mode 100644 index 232762b82a9ae3e979a1f38a7beb715c87438f40..0000000000000000000000000000000000000000 Binary files a/doc/fluid/dev/ci_build_whl.png and /dev/null differ diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md deleted file mode 120000 index 955216ca62e71b4d3666e1662aa86c9495d2e7d6..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/contribute_to_paddle_cn.md +++ /dev/null @@ -1 +0,0 @@ -../../v2/dev/contribute_to_paddle_cn.md \ No newline at end of file diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md deleted file mode 120000 index f9fc68c37e17a8a365b0d7fae86c16b0d094631f..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1 +0,0 @@ -../../v2/dev/contribute_to_paddle_en.md \ No newline at end of file diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst deleted file mode 100644 index 37e608160db0ad5a92297987937bbbfa8f842ea8..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/index_cn.rst +++ /dev/null @@ -1,16 +0,0 @@ -开发标准 ------------- - -.. toctree:: - :maxdepth: 1 - - contribute_to_paddle_cn.md - write_docs_cn.md - api_doc_std_cn.md - new_op_cn.md - new_op_kernel.md - use_eigen_cn.md - name_convention.md - support_new_device.md - releasing_process_cn.md - op_markdown_format.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst deleted file mode 100644 index d7f83035010f13c30514673ecbee301f194dc175..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/index_en.rst +++ /dev/null @@ -1,16 +0,0 @@ -Development ------------- - -.. toctree:: - :maxdepth: 1 - - contribute_to_paddle_en.md - write_docs_en.md - api_doc_std_en.md - new_op_en.md - new_op_kernel.md - use_eigen_en.md - name_convention.md - support_new_device.md - releasing_process_en.md - op_markdown_format.md diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md deleted file mode 100644 index 6b4244d0f506c8cd6c08739141eabad27c581ca7..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/name_convention.md +++ /dev/null @@ -1,65 +0,0 @@ -# Operator's Parameter Name Convention - -To make the operator document itself more clear, we recommend operator names obey the listing conventions. - -## OpProtoMaker names - -When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61) , and will be used in client language to create operator. - -- Input/Output. - - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words. - - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified. - -- Attribute. - - Attribute name follows the **snake_case**. e.g. `x`, `y`, `axis`, `rowwise_matrix`. Also, attribute name prefers to meaningful English words. - -- Comments. - - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g. Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`. - - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`. - -- Order. - - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice. - -## Best Practice - -Here we give some examples to show how these rules will be used. - -- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`. - -- The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`. - -- The operator contains attribute. e.g. `cosine`, inputs : `X`, `axis`, outputs : `Out`. - - We give a full example of Accumulator Operator. - -```c++ -class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker { -public: - AccumulateOpMaker(OpProto *proto, - OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. - If the output size is not the same as input size, - the output tensor is first reshaped and initialized to zero, and only then, accumulation is done."); - AddOutput("Out", "(Tensor) Accumulated output tensor"); - AddAttr("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f); - AddComment(R"DOC( -Accumulate Operator. - -This operator accumulates the input tensor to the output tensor. If the -output tensor already has the right size, we add to it; otherwise, we first -initialize the output tensor to all zeros, and then do accumulation. Any -further calls to the operator, given that no one else fiddles with the output -in the interim, will do simple accumulations. - -Accumulation is done as follows: - -Out = 1*X + gamma*Out - -where X is the input tensor, Out is the output tensor and gamma is the multiplier -argument. - -)DOC"); - } -}; -``` diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md deleted file mode 100644 index ff7408111fa20a7a6a3a2fe9f9ba20835918f399..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/new_op_cn.md +++ /dev/null @@ -1,435 +0,0 @@ -# 如何写新的Operator - - - [概念简介](#概念简介) - - [实现C++类](#实现c类) - - [定义ProtoMaker类](#定义protomaker类) - - [定义Operator类](#定义operator类) - - [定义OpKernel类](#定义opkernel类) - - [注册Operator](#注册operator) - - [编译](#编译) - - [绑定Python](#绑定python) - - [实现单元测试](#实现单元测试) - - [前向Operator单测](#前向operator单测) - - [反向Operator单测](#反向operator单测) - - [编译和执行](#编译和执行) - - [注意事项](#注意事项) - - -## 概念简介 - -简单介绍需要用到基类,详细介绍请参考设计文档。 - -- `framework::OperatorBase`: Operator(简写,Op)基类。 -- `framework::OpKernel`: Op计算函数的基类,称作Kernel。 -- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 -- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 - -依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: - - - - - - - - - - - - - - - - - - - - - - - - - - -
内容定义位置
OpProtoMake定义 .cc 文件,Backward Op不需要定义OpProtoMake
Op定义 .cc 文件
Kernel实现 CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。
注册Op Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中
- - -实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** - - -下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 - - -## 实现C++类 - - -### 定义ProtoMaker类 - -矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 - -首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释: - -```cpp -class MulOpMaker : public framework::OpProtoAndCheckerMaker { - public: - MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor), 2D tensor of size (M x K)"); - AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); - AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); - AddComment(R"DOC( -Two Element Mul Operator. -The equation is: Out = X * Y -)DOC"); - } -}; -``` - -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: - - - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 - - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 - -构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 - -上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。 - - -再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例: - -```cpp -template -class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) Input tensor of scale operator."); - AddOutput("Out", "(Tensor) Output tensor of scale operator."); - AddComment(R"DOC( -Scale operator -$$Out = scale*X$$ -)DOC"); - AddAttr("scale", - "(float, default 1.0)" - "The scaling factor of the scale operator.") - .SetDefault(1.0); - } -}; -``` - -这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 - -### 定义GradProtoMaker类 -每个Op的必须有一个对应的GraProtoMaker,若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量的会造成显存浪费。 -下面示例定义了ScaleOp的GradProtoMaker。 - -```cpp -class ScaleGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDesc(); - grad_op->SetType("scale"); - grad_op->SetInput("X", OutputGrad("Out")); - grad_op->SetOutput("Out", InputGrad("X")); - grad_op->SetAttr("scale", GetAttr("scale")); - return std::unique_ptr(grad_op); - } -}; -``` - -### 定义Operator类 - -下面实现了MulOp的定义: - -```cpp -class MulOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - auto dim0 = ctx.Input("X")->dims(); - auto dim1 = ctx.Input("Y")->dims(); - PADDLE_ENFORCE_EQ(dim0.size(), 2, - "input X(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("X")); - PADDLE_ENFORCE_EQ(dim1.size(), 2, - "input Y(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("Y")); - PADDLE_ENFORCE_EQ( - dim0[1], dim1[0], - "First matrix's width must be equal with second matrix's height."); - ctx.Output("Out")->Resize({dim0[0], dim1[1]}); - } -}; -``` - -[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: - -```cpp -using framework::OperatorWithKernel::OperatorWithKernel; -``` - -这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: - -```cpp -MulOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} -``` - -还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: - - - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 - - 2). 设置输出Tensor的形状。 - -通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 - -### 定义OpKernel类 - -`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: - -- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 - -- `typename T` : 表示数据类型,如`float`, `double`等。 - -需要为`MulKernel`类重写`Compute`接口。 -- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。 -- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。 -- `Compute`函数里实现`OpKernel`的具体计算逻辑。 - -下面是 `MulKernel` `Compute`的实现: - - ```cpp - template - class MulKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* Z = context.Output("Out"); - Z->mutable_data(context.GetPlace()); - auto& device_context = context.template device_context(); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); - } - }; - ``` - -需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** - -`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 - -为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。 - -到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 -反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 - -### 注册Operator - -- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 - - ```cpp - namespace ops = paddle::operators; - REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, - paddle::framework::DefaultGradOpDescMaker) - REGISTER_OPERATOR(mul_grad, ops::MulGradOp) - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); - ``` - - 在上面的代码中: - - - `REGISTER_OPERATOR` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 - - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 - - -- 在 `.cu`文件中注册CUDA Kernel。 - - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: - - ```cpp - // if use Eigen unsupported module before include head files - #define EIGEN_USE_GPU - - namespace ops = paddle::operators; - REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); - REGISTER_OP_CUDA_KERNEL(mul_grad, - ops::MulGradKernel); - ``` - -### 编译 - -运行下面命令可以进行编译: - -``` -make mul_op -``` - -## 绑定Python - -系统会对新增的op自动绑定Python,并链接到生成的lib库中。 - -## 实现单元测试 - -单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。 - -### 前向Operator单测 - -Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: - -1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 -2. 生成随机的输入数据。 -3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 -4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。 - - - ```python - import unittest - import numpy as np - from op_test import OpTest - - - class TestMulOp(OpTest): - def setUp(self): - self.op_type = "mul" - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - ``` - -上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - -- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 -- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 -- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 - -### 反向operator单测 - -而反向测试中: -- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 - - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。 -- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 - - -### 编译和执行 - -`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 - -请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: - -```bash -make test ARGS="-R test_mul_op -V" -``` - -或者: - -```bash -ctest -R test_mul_op -``` - -## 注意事项 - -- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。 -- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 -- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 - -### PADDLE_ENFORCE使用注意 - -实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下: - -``` -PADDLE_ENFORCE(表达式, 错误提示信息) -PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息) -``` - -如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。 -为了确保提示友好易懂,开发者需要注意其使用方法。 - -#### 总体原则 - -任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方,必须有详略得当的备注解释!**错误提示信息**不能为空! - -#### 提示信息书写标准 - -1. [required] 哪里错了?为什么错了? - - 例如:`ValueError: Mismatched label shape` -2. [optional] 期望的输入是什么样的?实际的输入是怎样的? - - 例如:`Expected labels dimension=1. Received 4.` -3. [optional] 能否给出修改意见? - - 例如:`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.` - -如果并非必要或者简洁的描述即可表达清楚以上要点,根据情况书写亦可。 - -##### FAQ 典型问题 - -1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示! - -问题示例1 :未写提示信息 -``` -PADDLE_ENFORCE(ctx->HasInput("X"), ""); -``` -问题示例2 :提示信息过于简单 -``` -PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? -``` - -2. 在报错信息中使用开发人员定义的变量缩写,不易理解! - -问题示例: -``` -PADDLE_ENFORCE(forward_pd != nullptr, - "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂 -``` - -3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input) -问题示例: -```cpp -auto *out = ctx.Output("Out"); -auto *in = ctx.Input("X"); -out->ShareDataWith(*in); -``` -Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。 - -4. OP实现的性能实践 -调用了eigen的broadcast, chop等操作,性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen,gpu实现可以实现cuda kernel. - - -#### OP InferShape检查提示信息特别说明 - -- 检查输入输出变量,请统一遵循以下格式 -`Input(变量名) of OP名 operator should not be null.` - -正确示例: -``` -PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input(Input) of LSTMP operator should not be null."); -``` - -- 反向Op的输入输出检查,要写明反向Op的名字 - -正确示例: -``` -PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of LoDResetGrad opreator should not be null."); -``` diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md deleted file mode 100644 index f8de271ed4e5e0fb4018478bffd4b525d4319738..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/new_op_en.md +++ /dev/null @@ -1,352 +0,0 @@ -# How to write a new operator - - - [Background](#background) - - [Implementing C++ Types](#implementing-c-types) - - [Defining ProtoMaker](#defining-protomaker) - - [Defining Operator](#defining-operator) - - [Defining OpKernel](#defining-opkernel) - - [Registering Operator and OpKernel](#registering-operator-and-opkernel) - - [Compilation](#compilation) - - [Python Binding](#python-binding) - - [Unit Tests](#unit-tests) - - [Testing Forward Operators](#testing-forward-operators) - - [Testing Backward Operators](#testing-backward-operators) - - [Compiling and Running](#compiling-and-running) - - [Remarks](#remarks) -## Background - -Here are the base types needed. For details, please refer to the design docs. - -- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API. -- `framework::OperatorBase`: Operator (Op)base class. -- `framework::OpKernel`: Base class for Op computation kernel. -- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels. - - -Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information: - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Information Where is it defined
OpProtoMake definition `.cc`files, Backward Op does not need an OpProtoMake interface.
Op definition `.cc` files
Kernel implementation The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
- - -New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** - - -Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. - - -## Implementing C++ Types - - -### Defining ProtoMaker - -Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. - -First, define `ProtoMaker` to describe the Operator's input, output, and additional comments: - -```cpp -class MulOpMaker : public framework::OpProtoAndCheckerMaker { - public: - MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor), 2D tensor of size (M x K)"); - AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); - AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); - AddComment(R"DOC( -Two Element Mul Operator. -The equation is: Out = X * Y -)DOC"); - } -}; -``` - -[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: - - - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces. - - `framework::OpAttrChecker` is used to validate variable attributes. - -The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. - -The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md). - - -An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows: - -```cpp -template -class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of scale operator.").NotInGradient(); - AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); - AddComment(R"DOC(Scale operator -The equation is: Out = scale*X -)DOC"); - AddAttr("scale", "scale of scale operator.").SetDefault(1.0); - } -}; -``` - -Note `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. - - -### Defining Operator - -The following code defines the interface for MulOp: - -```cpp -class MulOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - auto dim0 = ctx.Input("X")->dims(); - auto dim1 = ctx.Input("Y")->dims(); - PADDLE_ENFORCE_EQ(dim0.size(), 2, - "input X(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("X")); - PADDLE_ENFORCE_EQ(dim1.size(), 2, - "input Y(%s) should be a tensor with 2 dims, a matrix", - ctx.op_.Input("Y")); - PADDLE_ENFORCE_EQ( - dim0[1], dim1[0], - "First matrix's width must be equal with second matrix's height."); - ctx.Output("Out")->Resize({dim0[0], dim1[1]}); - } -}; -``` - -[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member - -```cpp -using framework::OperatorWithKernel::OperatorWithKernel; -``` - -expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as - -```cpp -MulOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} -``` - -`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to - - - 1). validate and error out early: it checks input data dimensions and types. - - 2). configures the tensor shape in the output. - -Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. - -### Defining OpKernel - -`MulKernel` inherits `framework::OpKernel`, which includes the following templates: - -- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43). - -- `typename T` denotes data type, such as `float` or `double`. - -`MulKernel` types need to rewrite the interface for `Compute`. - -- `Compute` takes one input parameter: `const framework::ExecutionContext& context`. -- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables. -- `Compute` implements the computation logics of an `OpKernel`. - -`MulKernel`'s implementation of `Compute` is as follows: - - ```cpp - template - class MulKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* Z = context.Output("Out"); - Z->mutable_data(context.GetPlace()); - auto& device_context = context.template device_context(); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); - } - }; - ``` - -Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.** - -`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc). - -To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md). - - -This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file. - -The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. - -### Registering Operator and OpKernel - -- In `.cc` files, register forward and backward operator classes and the CPU kernel. - - ```cpp - namespace ops = paddle::operators; - REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, - paddle::framework::DefaultGradOpDescMaker) - REGISTER_OPERATOR(mul_grad, ops::MulGradOp) - - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); - ``` - - In that code block, - - - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. - - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. - - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. - - -- Registering CUDA Kernel in `.cu` files - - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as - - ```cpp - // if use Eigen unsupported module before include head files - #define EIGEN_USE_GPU - - namespace ops = paddle::operators; - REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); - REGISTER_OP_CUDA_KERNEL(mul_grad, - ops::MulGradKernel); - ``` - -### Compilation - -Run the following commands to compile. - -``` -# maybe you need to rerun cmake -make mul_op -``` - -## Python Binding - -The system will automatically bind to Python and link it to a generated library. - -## Unit Tests - -Unit tests for an operator include - -1. comparing a forward operator's implementations on different devices, - -2. comparing a backward operator's implementation on different devices, and - -3. a scaling test for the backward operator. - -Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py). - -### Testing Forward Operators - -A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following: - -1. Defining input, output and relevant attributes in `setUp` method. - -2. Generating random input data. - -3. Implementing the same computation logic in a Python script. - -4. Call check gradient function to check the backward operator. - - ```python - import unittest - import numpy as np - from op_test import OpTest - - - class TestMulOp(OpTest): - def setUp(self): - self.op_type = "mul" - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } - self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - ``` -Get its output, and compare it with the forward operator's own output. - -The code above first loads required packages. In addition, we have - -- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. -- `self.inputs` defines input, with type `numpy.array` and initializes it. -- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. - -### Testing Backward Operators - -Some key points in checking gradient above include: - -- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. - - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. - - The second variable `"Out"` points to the network's final output target `Out`. - - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests. -- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input. - -### Compiling and Running - - -Any new unit testing file of the format `test_*.py` added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile. - -Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`. - -After successfully compiling the project, run the following command to run unit tests: - -```bash -make test ARGS="-R test_mul_op -V" -``` - -Or, - -```bash -ctest -R test_mul_op -``` - -## Remarks - -- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures. -- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. -- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md deleted file mode 100644 index 87e617d44041bde9c9051151878ffb4304689b3c..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/new_op_kernel.md +++ /dev/null @@ -1,121 +0,0 @@ -# Add Kernels for a New Device - -## Background - -PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. - -[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md). - -## Write Kernels for A New Device - -### Add A New Device - - For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24). We will correct this ASAP. - -To register a new device, we need to add an enum value to `LibraryType`: - -``` -enum class LibraryType { - kPlain = 0, - kMKLDNN = 1, - kCUDNN = 2, -}; -``` - - -### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53) - -If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`: - -```cpp -struct CUDAPlace { - CUDAPlace() : CUDAPlace(0) {} - explicit CUDAPlace(int d) : device(d) {} - - inline int GetDeviceId() const { return device; } - // needed for variant equality comparison - inline bool operator==(const CUDAPlace &o) const { - return device == o.device; - } - inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } - - int device; -}; - -typedef boost::variant Place; -``` - -### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)) -After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it. - -```cpp -class DeviceContext { - public: - virtual ~DeviceContext() {} - virtual Place GetPlace() const = 0; - - virtual void Wait() const {} -}; -``` - -### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device. - -A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) - -```cpp -class OpKernelBase { - public: - /** - * ExecutionContext is the only parameter of Kernel Run function. - * Run will get input/output variables, state such as momentum and - * device resource such as CUDA stream, cublas handle, etc. from - * ExecutionContext. User should construct it before run the Operator. - */ - - virtual void Compute(const ExecutionContext& context) const = 0; - - virtual ~OpKernelBase() = default; -}; - -template -class OpKernel : public OpKernelBase { - public: - using ELEMENT_TYPE = T; -}; -``` - - -### Register the OpKernel to framework - -After writing the components described above, we should register the kernel to the framework. - -We use `REGISTER_OP_KERNEL` to do the registration. - -```cpp -REGISTER_OP_KERNEL( - op_type, - library_type, - place_type, - kernel0, kernel1, ...) -``` - -kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`. - -take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318)) as an example: - - ```cpp - REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace, - paddle::operators::GemmConvKernel, - paddle::operators::GemmConvKernel); - - REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); - ``` - -In the code above: - - - `conv2d` is the type/name of the operator - - `CUDNN/CPU` is `library` - - `paddle::platform::CUDAPlace/CPUPlace` is `place` - - template parameter `float/double` on `CUDNNConvOpKernel` is `data_type`. diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md deleted file mode 100644 index 4e539d7992e5f67ee7b07193b59b6b425b73c9e5..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/op_markdown_format.md +++ /dev/null @@ -1,64 +0,0 @@ -# Standard Markdown Format for Operators -The following should be the standard format for documentation for all the operators that will get rendered in the `html`: - -``` -Operator Name (In PaddlePaddle) - -Operator Name (Standard) - -Operator description. - -LaTeX equation of how the operator performs an update. - -The signature of the operator. -``` - -Each section mentioned above has been covered in further detail in the rest of the document. - -## PaddlePaddle Operator Name -This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: -`array to lod tensor` should be written as `array_to_lod_tensor`. - -This naming convention should be standard across all PaddlePaddle operators. - -## Standard Operator Name -This is the standard name of the operator as used in the community. The general standard is usually: -- Standard abbreviations like `SGD` are written in all capital letters. -- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). -- Keep numbers inside a word as is, with no boundary delimiters. -- Follow the name of the operator with the keyword: `Activation Operator.` - -## Operator description -This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. - -## LaTeX equation -This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). - -## The signature -This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: -`Section : -VariableName : (VariableType) VariableDescription -... -... -` - - -The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`: - -``` -sgd - -SGD operator - -This operator implements one step of the stochastic gradient descent algorithm. - -param_out = param_learning_rate * grad - -Inputs: -Param : (Tensor) Input parameter -LearningRate : (Tensor) Learning rate of SGD -Grad : (Tensor) Input gradient - -Outputs: -ParamOut : (Tensor) Output parameter -``` diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md deleted file mode 100644 index acea9a2b5df903a958edf3683900e165670e196f..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/releasing_process_cn.md +++ /dev/null @@ -1,195 +0,0 @@ -# PaddlePaddle发行规范 - -PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 - -PaddlePaddle每次发新的版本,遵循以下流程: - -1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 -3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 -4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 -5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 -6. release分支稳定后,打上正式的release tag,比如`0.10.0`。 -7. 将这个版本的python wheel包发布到pypi。 -8. 更新Docker镜像(参考后面的操作细节)。 - -需要注意的是: - -* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。 - -* release分支原则上只接受修复类的修改,不接受新feature。 - -## 发布wheel包到pypi - -1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) -完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 -弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。 - -1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。 -1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 -1. 上传: -``` -cd build/python -pip install twine -twine upload dist/[package to upload] -``` - -* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux - 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 -* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。 - -## 发布Docker镜像 - -上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 -版本号对应的tag即可: - -``` -docker pull [镜像]:latest -docker tag [镜像]:latest [镜像]:[version] -docker push [镜像]:[version] -``` - -需要更新的镜像tag包括: - -* `[version]`: CPU版本 -* `[version]-openblas`: openblas版本 -* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5) -* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像 - -之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。 - -## PaddlePaddle 分支规范 - -PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 - -* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 -* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 -* `master`分支因为历史原因,已经废弃。 - -* 其他开发者fork的feature branch。 - * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 - * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。 - * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 - -## PaddlePaddle回归测试列表 - -TODO - -### PaddlePaddle Book中所有章节 - -PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
新手入门章节 识别数字 图像分类词向量 情感分析语意角色标注 机器翻译个性化推荐
API.V2 + Docker + GPU
API.V2 + Docker + CPU
`paddle_trainer` + Docker + GPU
`paddle_trainer` + Docker + CPU
API.V2 + Ubuntu + GPU
API.V2 + Ubuntu + CPU
`paddle_trainer` + Ubuntu + GPU
`paddle_trainer` + Ubuntu + CPU
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md deleted file mode 100644 index 00650946ff2e658cfad0e63a8f1e008902a2d36e..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/releasing_process_en.md +++ /dev/null @@ -1,228 +0,0 @@ -# PaddlePaddle Releasing Process - -PaddlePaddle manages its branches using Trunk Based Development, and [Semantic Versioning](http://semver.org/) as it's version number semantics. - -Each time we release a new PaddlePaddle version, we should follow the below steps: - -1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` -2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 -3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. -4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable. -5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. -6. After release branch is stable,Create the official release tag,such as `0.10.0`. -7. Release the python wheel package to pypi. -8. Update the docker image (More details below). - -NOTE: - -* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch. - -* release normally only accept bug fixes. Don't add new features. - - -## Publish Wheel Packages to pypi - -1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) - to build all wheel packages needed to publish. As shown in the following picture, choose a build - version, click "..." button on the right side of "Run" button, and switch to the second tab in the -pop-up box, choose the current release branch and click "Run Build" button. You may repeat this - step to start different versions of builds. - -1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`. -1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we - upload the package using `twine`, we need to rename the package from `linux_x86_64` to - `manylinux1_x86_64`. -1. Start the upload: - ``` - cd build/python - pip install twine - twine upload dist/[package to upload] - ``` - -* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can - download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using - scripts under `tools/manylinux1`. -* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the - old version. you must change the version number before upload a new one. - -### Publish wheel Packages for MacOS - -You need to build the binary wheel package for MacOS before publishing, to -make sure that the package can be used by many versions of MacOS -(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.), -you must build the package ***exactly*** following below steps: - -Build steps: - -1. install python from python.org downloads, and make sure it's currently in use - in your system. -1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions. -1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build` -1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org -1. `make -j` -1. `pip install delocate` -1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl` - -Then the whl under `fixed_wheel` is ready to upload. - -Install steps: - -1. run `pip install paddlepaddle...whl` -1. find the `libpython.dylib` that are currently in use: - - for python.org package installs, do nothing. - - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path` - -## Publish Docker Images - -Our CI tool will push latest images to DockerHub, so we only need to push a version tag like: - -``` -docker pull [image]:latest -docker tag [image]:latest [image]:[version] -docker push [image]:[version] -``` - -Tags that need to be updated are: -* `[version]`: CPU only version image -* `[version]-openblas`: openblas version image -* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5) -* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions - -You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/. - -## Branching Model - -PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. - -* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests. -* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release. -* `master` branch as been deprecated for historical reasons - -* Developer's feature branch。 - * Developer's feature branch should sync with upstream `develop` branch. - * Developer's feature branch should be forked from upstream `develop` branch. - * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review. - * In the review process, develop modify codes and push to their own feature branch. - -## PaddlePaddle Regression Test List - -TODO - -### All Chapters of PaddlePaddle Book - -We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including -V1 (`paddle_trainer` training) and V2 training and Fluid training. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Linear RegressionRecognize DigitsImage ClassificationWord2VecPersonalized RecommendationSentiment AnalysisSemantic Role LabelingMachine Translation
API.V2 + Docker + GPU
API.V2 + Docker + CPU
`paddle_trainer` + Docker + GPU
`paddle_trainer` + Docker + CPU
API.V2 + Ubuntu + GPU
API.V2 + Ubuntu + CPU
`paddle_trainer` + Ubuntu + GPU
`paddle_trainer` + Ubuntu + CPU
diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py deleted file mode 100644 index 3b074821cc2276a29b2a8639e82199fcf4d72020..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/src/fc.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def fc(input, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - name=None): - """ - **Fully Connected Layer** - - The fully connected layer can take multiple tensors as its inputs. It - creates a variable called weights for each input tensor, which represents - a fully connected weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor with its coresponding - weight to produce an output Tensor. If multiple input tensors are given, - the results of multiple multiplications will be sumed up. If bias_attr is - not None, a bias variable will be created and added to the output. Finally, - if activation is not None, it will be applied to the output as well. - - This process can be formulated as follows: - - .. math:: - - Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) - - In the above equation: - - * :math:`N`: Number of the input. - * :math:`X_i`: The input tensor. - * :math:`W`: The weights created by this layer. - * :math:`b`: The bias parameter created by this layer (if needed). - * :math:`Act`: The activation function. - * :math:`Out`: The output tensor. - - Args: - input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of - the input tensor(s) is at least 2. - size(int): The number of output units in this layer. - num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than - two dimensions. If this happens, the multidimensional tensor will first be flattened - into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input - tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) - dimensions will be flatten to form the first dimension of the final matrix (height of - the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to - form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. - Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. - param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable - parameters/weights of this layer. - bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to None, no bias will be added to the output units. - act (str, default None): Activation to be applied to the output of this layer. - name (str, default None): The name of this layer. - - Returns: - A tensor variable storing the transformation result. - - Raises: - ValueError: If rank of the input tensor is less than 2. - - Examples: - .. code-block:: python - - data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=data, size=1000, act="tanh") - """ diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md deleted file mode 100644 index 051a463cfcf97df2e2d5b6a880923ca70fefbd6e..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/support_new_device.md +++ /dev/null @@ -1,240 +0,0 @@ -# Design Doc: Supporting new Device/Library - -## Background - -Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner. - -On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. - -On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. - -So, how to support a new Device/Library in Fluid becomes a challenge. - - -## Basic: Integrate A New Device/Library - -For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md). - -There are mainly three parts that we have to consider while integrating a new device/library: - -- Place and DeviceContext: indicate the device id and manage hardware resources - -- Memory and Tensor: malloc/free data on certain device - -- Math Functor and OpKernel: implement computing unit on certain devices/libraries - -### Place and DeviceContext - -Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. - -#### Place -Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. - -``` - | CPUPlace -Place --| CUDAPlace - | FPGAPlace -``` - -And `Place` is defined as follows: - -``` -typedef boost::variant Place; -``` - -#### DeviceContext - -Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/fluid/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. - - -``` - /-> CPUDeviceContext -DeviceContext ----> CUDADeviceContext - \-> FPGADeviceContext -``` - -An example of Nvidia GPU is as follows: - -- DeviceContext - - -``` -class DeviceContext { - virtual Place GetPlace() const = 0; -}; -``` - - -- CUDADeviceContext - - -``` -class CUDADeviceContext : public DeviceContext { - Place GetPlace() const override { return place_; } -private: - CUDAPlace place_; - cudaStream_t stream_; - cublasHandle_t cublas_handle_; - std::unique_ptr eigen_device_; // binds with stream_ -}; -``` - -### Memory and Tensor - - -#### memory module - -Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36): - -``` -template -void* Alloc(Place place, size_t size); - -template -void Free(Place place, void* ptr); - -template -size_t Used(Place place); -``` - -To implement these interfaces, we have to implement MemoryAllocator for different Devices. - - -#### Tensor - -[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place. - -```cpp -class Tensor { - public: - /*! Return a pointer to mutable memory block. */ - template - inline T* data(); - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(platform::Place place); - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(DDim dims, platform::Place place); - - /*! Resize the dimensions of the memory block. */ - inline Tensor& Resize(const DDim& dims); - - /*! Return the dimensions of the memory block. */ - inline const DDim& dims() const; - - private: - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /*! points to dimensions of memory block. */ - DDim dim_; -}; -``` - -`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory. - -```cpp -paddle::framework::Tensor t; -paddle::platform::CPUPlace place; -// set size first -t.Resize({2, 3}); -// allocate memory on CPU later -t.mutable_data(place); -``` - - - -### Math Functor and OpKernel - -Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors. - -Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example: - -The interface is defined in the header file. - -``` -template -class MaxOutFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* output, int groups); -}; -``` - -CPU implementation is in .cc file - -``` -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups) { - ... - } -}; -``` - -CUDA implementation is in .cu file - -``` -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups) { - ... - } -}; -``` - - -We first obtain the computing handle from a concrete DeviceContext and then compute on tensors. - -The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map. - -Fluid provides different register interfaces in op_registry.h - - -Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example: - -In .cc file: - -``` -REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL( - crop_grad, ops::CropGradKernel); -``` - -In .cu file: - -``` -REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, ops::CropGradKernel); -``` - - -## Advanced topics: How to switch between different Device/Library - -Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. - - -For more details, please refer to following docs: - -- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md) -- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md deleted file mode 100644 index 56203d6fad444f61ef1be187ad0d149b2aa99ba4..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/use_eigen_cn.md +++ /dev/null @@ -1,146 +0,0 @@ -# 在Paddle中如何使用Eigen - -神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 - - -## Eigen Tensor模块 - -Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 - -关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) - - -## paddle::framework::Tensor - -Paddle Tensor定义在framework目录下,其主要接口如下: - -```cpp -class Tensor { - public: - /*! Return a pointer to mutable memory block. */ - template - inline T* data(); - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(platform::Place place); - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(DDim dims, platform::Place place); - - /*! Resize the dimensions of the memory block. */ - inline Tensor& Resize(const DDim& dims); - - /*! Return the dimensions of the memory block. */ - inline const DDim& dims() const; - - private: - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /*! points to dimensions of memory block. */ - DDim dim_; -}; -``` - -`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。 - -```cpp -paddle::framework::Tensor t; -paddle::platform::CPUPlace place; -// set size first -t.Resize({2, 3}); -// allocate memory on CPU later -t.mutable_data(place); -``` - -### paddle::framework::Tensor使用样例 -下面以AddOp为例说明Tensor的使用过程: - -- InferShape - -在运行神经网络计算图时,我们先调用每个`Operator`的`InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。 - -```cpp -void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), - ctx.Input("Y")->dims(), - "Two input of Add Op's dimension must be same."); - ctx.Output("Out")->Resize(ctx.Input("X")->dims()); -} -``` - - -- Run - -`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。 - -```cpp -void Compute(const framework::ExecutionContext& context) const override { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Y"); - auto* output = context.Output("Out"); - - output->mutable_data(context.GetPlace()); - - auto x = EigenVector::Flatten(*input0); - auto y = EigenVector::Flatten(*input1); - auto z = EigenVector::Flatten(*output); - - auto place = context.GetEigenDevice(); - - z.device(place) = x + y; -} -``` - - -### paddle::framework::Tensor到EigenTensor的转换 - -如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 - -以EigenTensor为例,做一个介绍 - -```cpp -Tensor t; -float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); -for (int i = 0; i < 1 * 2 * 3; i++) { - p[i] = static_cast(i); -} - -EigenTensor::Type et = EigenTensor::From(t); -``` - -From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到对EigenTensor的转换。由于Tensor的rank是模板参数,因此在转换时需要显示的指定。 - -在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 - -更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。 - - - -## 实现计算 - -当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 - -```cpp -auto x = EigenVector::Flatten(*input0); -auto y = EigenVector::Flatten(*input1); -auto z = EigenVector::Flatten(*output); -auto place = context.GetEigenDevice(); -z.device(place) = x + y; -``` - -在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。 - -由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。 diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md deleted file mode 100644 index 3313d097cb21e40c23aa13187b6a50562f12403a..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/use_eigen_en.md +++ /dev/null @@ -1,146 +0,0 @@ -# How to use Eigen in Paddle - -Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. - - -## Eigen Tensor Module - -The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. - -Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse. - -For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). - - -## paddle::framework::Tensor - -Paddle Tensor's is defined in the framework directory with the following interface: - -```cpp -class Tensor { - public: - /*! Return a pointer to mutable memory block. */ - template - inline T* data(); - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(platform::Place place); - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * - * @note If not exist, then allocation. - */ - template - inline T* mutable_data(DDim dims, platform::Place place); - - /*! Resize the dimensions of the memory block. */ - inline Tensor& Resize(const DDim& dims); - - /*! Return the dimensions of the memory block. */ - inline const DDim& dims() const; - - private: - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /*! points to dimensions of memory block. */ - DDim dim_; -}; -``` - -`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory. - -```cpp -paddle::framework::Tensor t; -paddle::platform::CPUPlace place; -// set size first -t.Resize({2, 3}); -// allocate memory on CPU later -t.mutable_data(place); -``` - -### paddle::framework::Tensor Usage -`AddOp` demonstrates Tensor's usage. - -- InferShape - -When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor. - -```cpp -void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), - ctx.Input("Y")->dims(), - "Two input of Add Op's dimension must be same."); - ctx.Output("Out")->Resize(ctx.Input("X")->dims()); -} -``` - - -- Run - -```cpp -void Compute(const framework::ExecutionContext& context) const override { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Y"); - auto* output = context.Output("Out"); - - output->mutable_data(context.GetPlace()); - - auto x = EigenVector::Flatten(*input0); - auto y = EigenVector::Flatten(*input1); - auto z = EigenVector::Flatten(*output); - - auto place = context.GetEigenDevice(); - - z.device(place) = x + y; -} -``` - - -## paddle::framework::Tensor到EigenTensor的转换 - -As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. - -Using EigenTensor as an example: - -```cpp -Tensor t; -float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); -for (int i = 0; i < 1 * 2 * 3; i++) { - p[i] = static_cast(i); -} - -EigenTensor::Type et = EigenTensor::From(t); -``` - -`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation. - -In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. - -For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file. - - - -## Implementing Computation - -While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor. - -```cpp -auto x = EigenVector::Flatten(*input0); -auto y = EigenVector::Flatten(*input1); -auto z = EigenVector::Flatten(*output); -auto place = context.GetEigenDevice(); -z.device(place) = x + y; -``` - -In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface. - -Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels). diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst deleted file mode 120000 index 2c281eaaf43bbfad84c3be9ed1d1bd0dbc77fa9b..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/write_docs_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/dev/write_docs_cn.rst \ No newline at end of file diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst deleted file mode 120000 index cb2b9b0ff1f1d9e0e5201d160f6b7d9d451374e2..0000000000000000000000000000000000000000 --- a/doc/fluid/dev/write_docs_en.rst +++ /dev/null @@ -1 +0,0 @@ -../../v2/dev/write_docs_en.rst \ No newline at end of file diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst deleted file mode 100644 index 395c1109891b5a00eab6f0b44d855658def7fdd6..0000000000000000000000000000000000000000 --- a/doc/fluid/faq/index_cn.rst +++ /dev/null @@ -1,2 +0,0 @@ -FAQ ------------- diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst deleted file mode 100644 index 395c1109891b5a00eab6f0b44d855658def7fdd6..0000000000000000000000000000000000000000 --- a/doc/fluid/faq/index_en.rst +++ /dev/null @@ -1,2 +0,0 @@ -FAQ ------------- diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md deleted file mode 100644 index 79df6c59578e2acf495a3453ab61f069c3f09a49..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md +++ /dev/null @@ -1,1819 +0,0 @@ - -# Paddle Fluid 开发者指南 - ---- - -### ==1==. 为什么需要 PaddlePaddle Fluid? - ---- - -### 两个基础问题 - - - -1. 如何描述机器学习模型和优化过程? - - 完备自洽,表达能力足以支持潜在出现的各种计算需求 -1. 如何充分利用资源高效计算? - - 支持异步设备、多卡、分布式计算 - - 降低计算/计算优化的开发成本 - - …… - - - ---- - -### 如何描述模型和优化过程? - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
一组连续执行的layersvariable和operator构成的计算图 不再有模型的概念
2013 Caffe,Theano, Torch, PaddlePaddle
2015 TensorFlow, MxNet, Caffe2, ONNX, n-graph
2016 PyTorch, TensorFlow Eager Execution, **==PaddlePaddle Fluid==**
- ---- - - -###

目标

- - - -- 提高对各类机器学习任务的描述能力:能够描述潜在出现的任意机器学习模型。 -- 代码结构逻辑清晰,各模块充分解耦:内外部贡献者能够专注于自己所需的功能模块,基于框架进行再次开发。 -- 从设计上,留下技术优化的空间和潜力。 -- 代码解耦后降低多设备支持、计算优化等的开发成本。 -- 在统一的设计理念下,实现自动可伸缩,自动容错的分布式计算。 - - - ---- - -## ==2.== Design Overview - ---- - -# Fluid: 系统形态 - -- [编译器式的执行流程,区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md) -
- -

- -

- ---- - -#### 让我们在Fluid程序实例中,区分编译时和运行时 - ---- -### Fluid 编译时 - - - -- ==**定义前向计算**== - - ```python - x = fluid.layers.data(name='x',shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(x=cost) - ``` - -- ==**添加反向、正则、优化**== - ```python - learning_rate = 0.01 - sgd_optimizer = fluid.optimizer.SGD(learning_rate) - sgd_optimizer.minimize(avg_cost) - ``` - - ---- - -### `Program` vs. 计算图 - - - -- 在科学计算领域,计算图是一种描述计算的经典方式。下图展示了从前向计算图(蓝色)开始,通过添加反向(红色)和优化算法相关(绿色)操作,构建出整个计算图的过程: -- -

- -

- - -- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成,相关概念会在后文详细展开。 -- 编译时 Fluid 接受前向计算(这里可以先简单的理解为是一段有序的计算流)`Program`,为这段前向计算按照:前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序,添加相关 `Operator`和`Variable`到`Program`到完整的计算。 - -
- ---- - -### Fluid 运行时 - - - -- ==**读入数据**== - - ```python - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=20) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - ``` -- ==**定义执行程序的设备**== - ```python - place = fluid.CPUPlace() - feeder = fluid.DataFeeder(place=place,feed_list=[x, y]) - ``` - -- ==创建执行器(Executor),执行初始化 `Program`和训练`Program`== - - ```python - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - print(avg_loss_value) - ``` - - ---- - -### 总结:框架做什么?用户做什么? -
- - - - - - - - - - - - - - - - -
构建训练执行训练
-用户:描述前向运算
框架:添加反向运算
框架:添加优化运算
框架:添加内存优化
框架:添加并行/多设备/分布式相关的计算单元 -
-框架:创建Operator(计算)+ Variable(数据)
框架:创建`Block`
框架:内存管理/设备管理
框架:执行计算 -
-
- ---- - -###

总结:编译时

- - -**用户编写一段Python程序,描述模型的前向计算** -1. 创建变量描述 `VarDesc` -1. 创建operators的描述 `OpDesc` -1. 创建operators的属性 -1. 推断变量的类型和形状,进行静态检查:`inferShape` -1. 规划变量的内存复用 -1. 创建反向计算 -1. 添加优化相关的Operators -1. (可选)添加多卡/多机相关的Operator,生成在多卡/多机上运行的程序 - - - ---- - -###

总结:运行时

- - -**执行规划好的计算** -1. 创建`Executor` -1. 为将要执行的一段计算,在层级式的`Scope`空间中创建`Scope` -1. 创建`Block`,依次执行`Block` - -

-
- Figure. 编译时运行时概览 -

- -
- ---- - -## ==3==. 用户如何描述计算? ---- - -### Fluid:==像写程序一样==定义计算 - - -- 顺序执行 - ```python - x = fluid.layers.data(name='x',shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - ``` - -- 条件分支: [swith](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) - - ```python - a = fluid.Var(10) - b = fluid.Var(0) - - switch = fluid.switch() - with switch.block(): - with switch.case(fluid.less_equal(a, 10)): - fluid.print("Case 1") - with switch.case(fluid.larger(a, 0)): - fluid.print("Case 2") - with switch.default(): - fluid.print("Case 3") - ``` - ->[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html). - - - ---- - -### Fluid: ==像写程序一样==定义计算 - - - -- 循环:[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105) - - ```python - d0 = layers.data("d0", shape=[10], dtype='float32') - data_array = layers.array_write(x=d0, i=i) - array_len = layers.fill_constant(shape=[1],dtype='int64', value=3) - - cond = layers.less_than(x=i, y=array_len) - while_op = layers.While(cond=cond) - with while_op.block(): - d = layers.array_read(array=data_array, i=i) - i = layers.increment(x=i, in_place=True) - layers.array_write(result, i=i, array=d) - layers.less_than(x=i, y=array_len, cond=cond) - ``` - -- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44) -- beam search [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105) - - - ---- - -####

总结

- - - -1. 用户层提供的描述语法具有完备性、自洽性,有能力支持对复杂计算过程描述 -1. 使用方式和核心概念可以类比编程语言,认知能够直接迁移 -1. 能够支持:定义问题,逐步求解 - - - ---- - -## ==3.== 核心概念 - ---- -### 编译时概念 :==变量和计算的描述== - - - -- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc` - - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto - -- 什么是 Fluid Program - - - 在Fluid中,一个神经网络任务(训练/预测)被描述为一段`Program` - - `Program`包含对`Variable`(数据)和 `Operator`(对数据的操作)的描述 - - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`,构成一段完整的`Fluid Program` - - ->编译阶段最终,经过 Transpiler 的执行规划,变换处理,生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行 - - - ---- - -### 编译时概念 :==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**== - - -1. 接受一段`ProgramDesc`作为输入,生成一段新的`ProgramDesc` - - - *Memory optimization transpiler*:向原始`ProgramDesc` 中插入 `FreeMemoryOps`,在一次迭代优化结束前提前释放内存,使得能够维持较小的 memory footprint - - - *Distributed training transpiler*:将原始的`ProgramDesc`中转化为对应的分布式版本,生成两段新的`ProgramDesc`: - 1. trainer进程执行的`ProgramDesc` - 1. parameter server执行的`ProgramDesc` - -1. ==**WIP**==: 接受一段`ProgramDesc`,生成可直接被`gcc`, `nvcc`, `icc`等编译的代码,编译后得到可执行文件 - - - ---- -### Transplier - -

- -

- ---- - -### 打印 `ProgramDesc` - -

- -

- - - -- `default_startup_program`:创建可学习参数,对参数进行初始化 -- `default_main_program`:由用户定义的模型,包括了前向、反向、优化及所有必要的计算 - -- 打印可读的 `Program` - ```python - from paddle.v2.fluid import debuger - print debuger.pprint_program_codes(framework.default_main_program().desc) - ``` - - ---- -### 输出效果 - - - - - - - - - - - - - - -
variable in block 0variable in block 0
-
- ---- - -### 运行时概念 - - - -- 数据相关 - - `Tensor` / `LoDTensor` / `Variable` - - `Scope` - -- 计算相关 - - `Block` - - `Kernel`、`OpWithKernel`、`OpWithoutKernel` - - - - - - - - - - - - - - - - - - - - - - - - - - - -
protobuf messagesC++ class objects
Data[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107) -[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) -
Operation[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35) -[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64) -
BlockBlockDesc -Block -
- -- 执行相关 :`Executor` - -
- ---- -#### Tensor 和 LoD(Level-of-Detail) Tensor - - -- Tensor 是$n$-dimensional arry的推广,LoDTensor是在Tensor基础上附加了序列信息 -- Fluid中输入、输出,网络中的可学习参数全部统一使用LoDTensor(n-dimension array)表示 -- 一个mini-batch输入数据是一个LoDTensor - - 在Fluid中,RNN 处理变长序列无需padding,得益于 `LoDTensor`表示 - - 可以简单将 LoD 理解为:`std::vector>` - - 对非序列数据,LoD 信息为空 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TensorFlowPaddlePaddle
RNNSupport -Support -
recursive RNNSupport -Support -
padding zerosMust -No need -
blob data typeTensor -LODTensor -
- -
- ---- -#### LoD 信息实例 - - - -

- -

- -- 图(a)的LoD 信息 - ```cpp - [0, 5, 8, 10, 14] - ``` -- 图(b)的 LoD 信息 - ```cpp - [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/] - ``` -
- ---- -#### Tensor, Variable, Scope 之间的关系 - -

- -

- - -1. `Block` 是一个实现层的概念,不在应用层暴露给用户。目前用户无法自行创建并利用`Block`,用户能够感知的只有`Program`这个概念。 -1. 逻辑上,可以将 `Block` 类比为编程语言中的大括号:定义了一段作用域,其中运行一段代码 -1. `Executor`会为每一个`Block`创建一个`Scope`,`Block`是可嵌套的,因此`Scope`也是可嵌套的 - - - ---- -### Executor - - - - - - - - - - - - - - -
接口说明

- -

输入
1. `ProgramDesc`
2. `Scope`
3.`block_id`

解释执行步骤
1. 创建所有 Variables
2. 逐一创建 Operator 并运行 -
- ---- -### Operator/OpWithKernel/Kernel - - -

- -

- -- operator 无状态,Operator的核心是==Run==方法 -- 一个operator可以注册多个kernel -- operator 可以无 kernel:while_op 、ifelse op - -
- ---- -#### Fluid Operator vs. PaddlePaddle layers - - - - - - - - - - - - - - - - - - -
LayerOperator

- -

- -

1. 内部维护状态
2. 包含forward和backward方法
1. 内部无状态
2. 只有Run方法
- -
- ---- - -### ==4.== 内存管理 - ---- -### 目标 - -- 为异构设备提供统一的内存分配、回收接口 -- 最小化管理内存所需的时间,最小化管理开销 -- 减少内存碎片 -- 将内存管理与计算(Operators/Kernels)完全剥离 -- 统一内存管理是内存优化的基础 - ---- - - - -### Memory 接口 - -- 内存管理模块向上层应用逻辑提供三个基础接口: - ```cpp - template - void* Alloc(Place place, size_t size); - - template - void Free(Place place, void* ptr); - - template - size_t Used(Place place); - - struct Usage : public boost::static_visitor { - size_t operator()(const platform::CPUPlace& cpu) const; - size_t operator()(const platform::CUDAPlace& gpu) const; - }; - ``` -- 模板参数 `Place` 指示内存分配发生的设备 -- 实现时,需特化支持的 `Place`, 提供以上三个接口的实现 - - - ---- -### 代码结构 - - - -内存管理模块可以理解为由以下两部分构成: - -1. SystemAllocator:实际从物理设备上分配、释放的内存的接口 -1. BuddyAllocator:内存管理算法 - - - ---- -### System Allocator - - - -- SystemAllocator 是实现物理内存分配、回收的基类 - - 不同设备上的内存分配和回收终将转化为标准接口调用 - - 为不同设备实现MemoryAllocator,继承自SystemAllocator - - ```cpp - class SystemAllocator { - public: - virtual ~SystemAllocator() {} - virtual void* Alloc(size_t& index, size_t size) = 0; - virtual void Free(void* p, size_t size, size_t index) = 0; - virtual bool UseGpu() const = 0; - }; - ``` - - ---- - -### CPU/GPU Allocator - - - -```cpp -class CPUAllocator : public SystemAllocator { - public: - virtual void* Alloc(size_t& index, size_t size); - virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu() const; -}; - -#ifdef PADDLE_WITH_CUDA -class GPUAllocator : public SystemAllocator { - public: - virtual void* Alloc(size_t& index, size_t size); - virtual void Free(void* p, size_t size, size_t index); - virtual bool UseGpu() const; - private: - size_t gpu_alloc_size_ = 0; - size_t fallback_alloc_size_ = 0; -}; -#endif -``` -- CPUAllocator和GPUAllocator分别继承自SystemAllocator,分别调用相应的标准库函数实现物理内存的分配和释放。 -- 一旦大块、连续的物理内存分配之后,将通过内存管理算法实现内存的按块分配、回收、重用等。 - - - ---- -### CPU Allocator - - - -- CPU 内存的分配提供两种选项: - 1. non-pinned memory:可分页内存 - 2. pinned memory:页锁定内存 - - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少,影响系统性能,默认CPU下分配的是可分页内存 - -- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。 - - ```cpp - DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); - DEFINE_double(fraction_of_cpu_memory_to_use, 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); - ``` - - - ---- -### GPU Allocator - - - -- 通过 cudaMalloc 分配GPU显存 -- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存 - - 如果可用显存小于请求分配大小,调用cudaMalloc进行分配 - - 如果可用显存不足,目前会报错退出。 -- 通过gflags控制GPU下一次性分配显存的大小: - - ```cpp - DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, - "Default use 92% of GPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); - ``` - - - ---- -#### 内存管理算法: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation) - - - -- Memory Arena:一次性分配大块连续内存,之后会基于这块内存进行内存管理:动态分配、释放、重用内存块。 -- 伙伴内存分配: - - 将内存划分为 2 的幂次方个分区,使用 best-fit 方法来分配内存请求。 - - 当释放内存时,检查 buddy 块,查看相邻的内存块是否也已被释放。如果是,将内存块合并,以最小化内存碎片。 - - 分配的内存在物理内存的自然边界对齐,提高内存访问效率。 - - 算法的时间效率高,单使用 best-fit 方法的缘故,会产生一定的内存浪费 - - - ---- - -### Buddy Allocator - - - -- BuddyAllocator 是一个单例,每个设备(如: GPU/CPU(0)/GPU(1)) 拥有一个BuddyAllocator -- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator -- 当请求的内存超过BuddyAllocator管理的空余内存时,将会调用SystemAllocator去指定的设备上分配物理内存 - - - ---- -### 实例:CPU 下内存管理接口的实现 - - - -- 对上层应用,统一通过BuddyAllocator来实现内存的分配、释放以及用量查询 - ```cpp - template <> - void* Alloc(platform::CPUPlace place, size_t size) { - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - void* p = GetCPUBuddyAllocator()->Alloc(size); - VLOG(10) << " pointer=" << p; - return p; - } - - template <> - void Free(platform::CPUPlace place, void* p) { - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - GetCPUBuddyAllocator()->Free(p); - } - - template <> - size_t Used(platform::CPUPlace place) { - return GetCPUBuddyAllocator()->Used(); - } - ``` - - ---- -### ==5.== 多设备支持 - ---- -### 多设备支持(一) - - - -- step 1:添加Place类型,由用户实现添加到框架 - - 可以将Place类型理解为一个整数加上一个枚举型,包括:设备号 + 设备类型 - -

- -

-- DeviceContext - - 不同的Place会对应一个相应的DeviceContext,用于组织管理与设备相关的信息 - - 例如,GpuDeviceContext中会管理Cuda stream - - 目前实现中一些特殊的库也会对应有自己的DeviceContext:例如: - ```cpp - class MKLDNNDeviceContext : public CPUDeviceContext {……} - ``` - - 每种设备对应的DeviceContext需要管理的内容不尽相同,视具体需求来实现 - -
- ---- - -### 多设备支持(二) - - - -- step 2: 增加KernelType,为相应的KernelType注册Kernel对象,由用户实现注册给框架 可以按照: - 1. Place 执行设备 - 1. DataType 执行数据类型 FP32/FP64/INT32/INT64 - 1. Memory layout: 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC - 1. 使用的库 - - 来区分Kernel,为同一个operator注册多个 Kernel。 - - ```cpp - struct OpKernelType { - proto::DataType data_type_; - DataLayout data_layout_; - platform::Place place_; - LibraryType library_type_; - } - ``` - - - ---- - -### 多设备支持(三) - - - -step 3: 运行时的 KernelType 推断和Kernel切换,按需要修改Kernel推断和Kernel切换规则 -- Expected Kernel:期待调用的Kernel:由(1)`Place`和计算精度决定;或(2)用户在配置中显示指定使用的计算库,如`cudnn`、`mkldnn`等。 -- Actual Kernel:运行时从`Operator`的输入(`Variable`)可以推断出实际需要的`KernelType` -- 当Expected Kernel和Actual Kernel不一致的时候,框架会插入`data_transformer`或者`data_layerout_transform`等,保证Expected Kernel可以执行,包括: - - CPUPlace -> GPUPlace :跨设备内存复制 - - NCHW -> nChw8c :Layout转换 - - FP32 -> FP16 :精度转换 _**尚未支持**_ - - …… -- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497) - - - ---- -## ==6.== while_op - ---- -### while_op - - - -- 循环执行一段`Program`,直到条件operator判断循环条件不满足时终止循环 -- while_op 的特殊之处: - 1. while_op 没有 kernel - 1. while_op 拥有自己的`Block`,会形成一段嵌套的`Block` - 1. ==while_op 内部创建了一个 Executor,来循环执行`Block`== - -- while_op 输入输出 : LoDTensorArray - ```cpp - namespace paddle { - namespace framework { - using LoDTensorArray = std::vector; - } - } - ``` - - 每一次循环,从原始输入中“切出”一个片段 - - LoDTensorArray 在Python端暴露,是Fluid支持的基础数据结构之一,用户可以直接创建并使用 - - - ---- -### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览 - - - -```cpp - -void Run(const framework::Scope &scope, - const platform::Place &dev_place) const override { - PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); - auto &cond = scope.FindVar(Input(kCondition))->Get(); - PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); - - framework::Executor executor(dev_place); - auto *block = Attr(kStepBlock); - - auto *program = block->Program(); - auto step_scopes = - scope.FindVar(Output(kStepScopes))->GetMutable(); - - while (cond.data()[0]) { - auto ¤t_scope = scope.NewScope(); - step_scopes->push_back(¤t_scope); - executor.Run(*program, ¤t_scope, block->ID(), - false /*create_local_scope*/); - } -} - -``` - - - ---- -### while_op 的重要应用:Dynamic RNN - ---- - -### 什么是 `dynamicRNN` ? - - -
- -1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据,在其上循环调用用户定义的单步计算 -1. 可学习参数在多个时间步之间共享 -1. `dynamicRNN` 由 `while_op` 实现 -1. 如果`dynamicRNN`中定义了`memory`,将会构成一个循环神经网络,否则其行为就等于在输入序列上循环调用预定义的单步计算 - -
- ---- - -#### `dynamic RNN` 用户接口 - - -

- -

- -- `dynamicRNN` 中的重要元素 - 1. **step input**: `dynamicRNN` 每个时间步的输入 - 1. **step function**: 用户定义的单步计算 - 1. **memory**: 用于形成循环连接 - 1. **external/static memory**:单步计算的每一步都可以全部读取到的外部输入 - -
- ---- - -#### dynamicRNN 中的 Memory - - - -`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量 - - `memory` “指向” 一个operator的输出变量,记作: A - - `memory` 可以被 LoDTensor 初始化(当LoD信息为空时,为非序列,否则为序列),默认`memory`被初始化为零 - - `memory` 在 operator A 前向计算之后,进行前向计算 - - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor - - `memory` 的输出可以是另一个 operator 的输入,于是形成了“循环”连接 - - - ---- - -### DynamicRNN 实现细节 - - - -- `while_op` 无法独立构成dynamicRNN,必须和一组相关的 operator 及数据结构配合 - - 依赖的 operators (这里仅列出最重要的,并非全部): - - `lod_rank_table` operator - - `lod_tensor_to_array` operator - - `array_to_lod_tensor` operator - - `shrink_memory` operator - - 依赖的数据结构 - - `TensorArray` - - `LoDRankTable` - -- 在Fluid中,RNN接受变长序列输入,无需填充,以上数据结构和相关的operator配合工作,实现了对变长输入以batch计算 - - - ---- - -### `dynamicRNN` 如何实现 batch 计算 ? - - - -- 问题: - - RNN 可以看作是一个展开的前向网络,前向网络的深度是最长序列的长度 - - 如果不对变长序列进行填充,将它们填充到一样长度,每个mini-batch输入将会不等长,每个样本展开长度不一致,导致前向和反向计算实现困难 - - - ----- -##### 实例 :RNN encoder-decoder with attention - - - -- 以机器翻译的RNN encoder-decoder 模型(涉及了`dynamicRNN`的所有设计要素)为例,下图是 RNN encoder-decoder 的原始输入: -

-
Figure. RNN encoder-decoder 原始batch 输入数据 -

- -- source word sequences 是encoder RNN的输出,是一个LoDTensor -- target word sequences 是look_uptable的输入,是一个LoDTensor -- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间,表示一个dense vector - -
- ---- - -### `dynamicRNN` 如何实现 batch 计算 ? - - - -1. 对一个mini batch中不等长样本进行排序,最长样本变成batch中的第一个,最短样本是batch中最后一个 - - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator` - - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index - -2. 构建每个时间步的batch输入:随着时间步增加,每个时间步的batch输入可能会逐渐缩小 - - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD) -3. 每个时间步输出写入一个输出 `LoDTensorArray` -3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序,还原会原始输入顺序 - - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor` - - - ---- - -### 运行实例 - -

- -

- ---- -### 运行实例 - -

- -

- - - -- 执行到第5~7个batch时,batch size将会缩小 - - - ---- -### 运行实例 - -

- -

- - - -- 第5 ~ 7个batch时RNN的`memory`会发生什么? - - `memory` 指向某个operator的输出Tensor,在该operator前向计算之后,“取回”其计算结果 - - 5 ~ 7时,遇到了序列的结束,==下一个时间步计算不再需要在已经结束的序列上展开== - - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入 - - - ---- -### 运行实例:batch 1 ~ 2 - -

-
Figure. 第1、2个batch输入dynamicRNN的batch输入 -

- ---- -### 运行实例:batch 3 ~ 4 - -

-
Figure. 第3、4个batch输入dynamicRNN的batch输入 -

- ---- - -### 运行实例:batch 5 ~ 7 - -

-
Figure. 第5、6、7个batch输入dynamicRNN的batch输入 -

- ---- -### ==7.== Fluid 代码结构 - ---- -### Fluid 代码结构 - - - - - - - - - - - - - - - -
代码结构模块结构
-

- -

-
-

- -

-
- ---- - -### ==8.== 文档总结 - ---- - - -- 设计概览 - - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md) - - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md) - - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md) -- 核心概念 - - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md) - - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md) - - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) - - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) - - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md) - - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) - - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) - ---- - -- 重要功能模块 - - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md) - - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md) - - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md) - - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md) - - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md) - -- 开发指南 - - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md) - - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md) - - 添加新的Kernel [->]( -https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md) - - - ---- - -### ==9.== 开发指南 - ---- - -#### 建议开发环境:使用 Docker 编译和测试 - - - -Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html) - -PaddlePaddle 在 Dockerhub 地址:[->]( - https://hub.docker.com/r/paddlepaddle/paddle/tags/) - -1. 获取PaddlePaddle的Docker镜像 - ```bash - docker pull paddlepaddle/paddle:latest-dev - ``` - -1. 启动 docker container - - ```bash - docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash - ``` - -1. 进入docker container后,从源码编译,请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html) - - - ---- - -### 一些说明 - - - -1. PaddlePaddle的Docker镜像为了减小体积,默认没有安装vim,可以在容器中执行`apt-get install -y vim`来安装vim。 -1. 开发推荐使用tag为`latest-dev`的镜像,其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像,主要用于运行PaddlePaddle程序。 -2. 在Docker中运行GPU程序,推荐使用nvidia-docker,[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。 - - - ```bash - nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash - ``` - - - - - ---- - -### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html) - - - -- ==提交PullRequest前请务必阅读==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html) -- 代码要求 - 1. 代码注释遵守 Doxygen 的样式 - 1. 确保编译器选项 WITH_STYLE_CHECK 已打开,并且编译能通过代码样式检查 - 1. 所有代码必须具有单元测试,且能够通过所有单元测试 -- 使用 `pre-commit` 钩子提交Pull Request - 1. 帮助格式化源代码(C++,Python) - 1. 在提交前自动检查一些基本事宜:如每个文件只有一个 EOL,Git 中不要添加大文件等 - 1. 安装pre-commit,并在PaddlePaddle根目录运行: - ```bash - ➜ pip install pre-commit - ➜ pre-commit install - ``` - - ---- - -### 如何贡献 - - - -1. 开始开发之前请先建立issue。 - - 让其它同学知道某项工作已经有人在进行,以避免多人开发同一功能的情况。 -1. 提交PR必须关联相关的issue。做法请参考:[->](https://help.github.com/articles/closing-issues-using-keywords/) - - 目的:为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能,为了解决什么样的问题。 - - 当PR被merge后,关联的issue会被自动关闭。 -1. PR review 中,reviewer的每条comment都必须回复。 - - 如修改完可直接回复:Done。 - - 目的:review comment 中可能会有(1)询问类型的问题;(2)可以在下一个PR修改的问题;(3)comment意见不合理等。需要明确回复,以便reviewer和其他人有历史可查,便于区分是否已经进行修改,或者准备下一个PR修改,或者意见不合理可以不用进行修改。 - - - ---- - -### ==10.== 添加新的 Operator - ---- - -### 概念简介 - - - -添加一个新的operator,会涉及实现以下C++类的派生类: - -1. `framework::OperatorBase`: Operator(简写,Op)基类。 -1. `framework::OpKernel`: Op计算函数的基类,称作Kernel。 -1. `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 -1. `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 - -依据是否包含kernel,可以将Op分为两种: -1. 包含Kernel的Op:继承自OperatorWithKernel,==绝大多数operator都属于这一类== -1. 不包含kernel的Op,继承自OperatorBase,只有少量Op属于这一类,例如while_op,ifelse_op - -这里主要介绍带Kernel的Op如何编写。 - - - ---- - -#### 添加新的Operator需要修改/添加哪些文件? - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
内容定义位置
-OpProtoMake定义 - -`.cc`文件,Backward Op不需要OpProtoMaker -
-Op定义 - -`.cc`文件 -
-Kernel实现 - -CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 -
-注册Op - -Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 -
- -- 添加 Operator 之前请阅读:[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。 -- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。 -- 根据文件名自动构建op和Python端绑定,请务必遵守以上命名,否则需要进一步修改PyBind相关文件及CMakeLists.txt。 -
- ---- - -###### 实现带Kernel的Operator step1: 定义ProtoMaker类 - - - -下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍 - -- clip_op计算公式:$Out = \min(\max(X, min), max)$ -- 首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释(*下面代码段的中注释进行了简化,实现时需按照规范添加注释*): - - ```cpp - template - class ClipOpMaker : public framework::OpProtoAndCheckerMaker { - public: - ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X","(Tensor)The input of clip op."); - AddOutput("Out", "(Tensor),The output of clip op."); - AddAttr( - "min", "(float),Minimum value."); - AddAttr( - "max", "(float),Maximum value."); - AddComment(R"DOC( - …… - )DOC"); - } - }; - ``` - - - ---- - -###### 实现带Kernel的Operator step2: 定义Operator类 - - - -下面的代码段实现了`clip_op`的定义: - -```cpp -class ClipOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ClipOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ClipOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto max = ctx->Attrs().Get("max"); - auto min = ctx->Attrs().Get("min"); - PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; -``` - - ---- - -### Operator 类中需要完成的工作 - - - -1. clip_op 继承自`OperatorWithKernel`, - - ```cpp - using framework::OperatorWithKernel::OperatorWithKernel; - ``` - 表示使用基类`OperatorWithKernel`的构造函数。 - -1. 重写`InferShape`接口。 - - `InferShape` 为const函数,不能修改Op的成员变 - - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`,从中可获取到输入输出以及属性 - - `InferShape` 会被调用两次,一次是编译时(创建op),一次是运行时(调用op的`Run`方法时),需要完成以下功能: - 1. 做检查, 尽早报错:检查输入数据维度、类型等是否合法 - 2. 设置输出Tensor的形状 - -通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。 - - - ---- - -### 补充说明 - - - -1. `InferShape`目前支持两种实现方式,二者最后都会生成一个functor注册给OpInfo结构体。 - 1. 继承framework::InferShapeBase,实现为一个functor(参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)) - 2. override InferShape函数(参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)) - -1. 什么是`functor` ? - - - 类或结构体仅重载了`()`,一般是可被多个kernel复用的计算函数。 - - - - ```cpp - template - class CrossEntropyFunctor { - public: - void operator()(const platform::CPUDeviceContext& ctx, - framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel) { - …… - } - }; - ``` - - - - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。 - - - ---- - -###### 实现带Kernel的Operator step3: 定义OpKernel类 - - - -- `ClipKernel`继承自`framework::OpKernel`,带有下面两个模板参数: - 1. `typename DeviceContext`: 表示设备类型,不同设备共享同一个Kernel时,需添加该模板参数。不共享时,需要提供针对不同设备的特化实现。 - 1. `typename T` : 表示支持的数据类型,如`float`, `double`等 - -- 在`ClipKernel`类中重写`Compute`方法 - 1. `Compute`接受输入参数:`const framework::ExecutionContext& context` - - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起,使得Op在调用`Compute`方法时,能够简单地通过名字拿到需要的输入输出`Variable` - - 与`InferShapeContext`相比,`ExecutionContext` 中增加了设备类型 - 1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑 - - - ---- -#### ClipKernel 代码概览 - - - -```cpp -template -class ClipKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max = context.Attr("max"); - auto min = context.Attr("min"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - const T* x_data = x->data(); - int64_t numel = x->numel(); - Transform trans; - trans(context.template device_context(), x_data, - x_data + numel, out_data, ClipFunctor(min, max)); - } -}; -``` - -- 为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用, Fluid 使用 Eigen 作为基础的矩阵运算库 -- Fluid对Eigen unsupported Tensor提供了一些基本的封装,可以在`Compute`接口中直接调用 - - 关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。 - - - ---- -###### 实现带Kernel的Operator step4: 实现反向Op - - - -- ==**反向Op没有`ProtoMaker`**==,除此之外定义与实现方式前向Op完全一致,不再赘述 -- 这里仅对反向Op的输入输出进行说明: - 1. 反向Op的输入 - - 前向Op的输出 - - 反向传播过程中传递给当前Op的梯度 - - 需要注意,Fluid中,不区分Cost Op和中间层Op,所有Op都必须正确处理接收到的梯度 - 2. 反向Op的输出 - - 对可学习参数的求导结果 - - 对所有输入的求导结果 - - - - ---- - -###### 实现带Kernel的Operator step5: 注册Op及Kernel - - - -至此Op和Op kernel都已经实现完毕,接下来,需要在`.cc`和`cu`文件中注册op和kernel - -1. 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 - - - - ```cpp - namespace ops = paddle::operators; - REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, - ops::ClipOpGrad); - REGISTER_OP_CPU_KERNEL( - clip, ops::ClipKernel); - REGISTER_OP_CPU_KERNEL( - clip_grad, ops::ClipGradKernel); - ``` - - - 在上面的代码片段中: - - 1. `REGISTER_OP` : 注册`ops::ClipOp`类,类型名为`clip`,该类的`ProtoMaker`为`ops::ClipOpMaker`,注册`ops::ClipOpGrad`,类型名为`clip_grad` - 1. `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op,例如:优化算法相关的Op - 1. `REGISTER_OP_CPU_KERNEL` :注册`ops::ClipKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::ClipGradKernel`类 - - -1. 按照同样方法,在`.cu`文件中注册GPU Kernel - - 如果CUDA Kernel的实现基于Eigen,需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` - - - ---- - -##### 编译和Python端绑定 - - - -- 运行下面命令可以仅编译新添加的Op: - - ``` - make mul_op - ``` - - 需注意,运行单元测试需要编译整个工程 - -- 如果遵循前文的文件命名规则,构建过程中,会自动为新增的op添加Python端绑定,并链接到生成的lib库中 - - - ---- - -###### 实现带Kernel的Operator step6: 添加前向单测及梯度检测 - - - -- 新增Op的单元测试统一添加至:[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录 -- 前向Operator单测 - - 1. Op单元测试继承自`OpTest`,各项具体的单元测试在`TestClipOp`里完成,所有单测case都以`TestXX`命名 - 1. 单元测试Operator,需要: - 1. 在`setUp`函数定义输入、输出,以及相关的属性参数 - 1. 生成随机的输入数据 - 1. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比 - 1. 反向梯度检测流程测试框架已经实现,直接调用相应接口`check_grad`即可 - -- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py),这里不再展开 - - - ---- -#### 编译执行单测 - - - -- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译 - - - 运行单元测试测时需要编译整个工程,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON` -- 编译成功后,执行下面的命令来运行单元测试: - - ```bash - make test ARGS="-R test_mul_op -V" - ``` - - 或者: - - ``` - ctest -R test_mul_op - ``` - - ---- - -### 添加Op的一些注意事项 - - - -- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,将会导致编译出错。 -- 注册Op时的类型名,需要和该Op的名字一样。不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`,会导致单元测试出错。 -- 如果Op没有实现CUDA Kernel,不要创建空的`*_op.cu`,会导致单元测试出错。 -- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 - - - ---- - -### ==10.== 使用相关问题 - ---- - -### 定义前向计算 - - - -- 当在python端执行时: - ```python - import paddle.v2.fluid as fluid - ``` - [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`: - ```python - # program is a global instance. - _main_program_ = Program() - _startup_program_ = Program() - ``` - -- 前向定义的过程就是不断往`mian_program`中添加Op和Variable -- 如果需要执行一个新的`mian_program`时,可以调用调用: - ```python - def switch_main_program(program): - """ - Switch the main program to a new program. - This funtion returns the previous main program. - """ - …… - ``` - - ---- - -### 自定义参数的初始化 - - - -- 调用`fluid.ParamAttr(……)`接口,自定义参数的初始化 - - ```python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), - learning_rate=1.0, - regularizer=L1Decay(1.0), - trainable=True, - clip=GradientClipByValue(-1.0, 1.0), - ) - y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) - ``` - -- 补充问题:如何创建 `Variable` - ```python - cur_program = Program() - cur_block = cur_program.current_block() - new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32") - ``` - - - ---- - -### 添加反向Op - - - -- 调用`fluid.backward.append_backward(X)`(`X`是一个Variable),来为一段前向`ProgramDesc`添加反Op - - ```python - data = fluid.layers.data(name="data", shape=(2,3,4)) - out = fluid.layers.fc(input=data,size=128,act=None) - loss = fluid.layers.reduce_sum(out) - fluid.backward.append_backward(loss=loss) - ``` - -- 添加优化相关的Op - ```python - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(loss) - ``` - -- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program` - -- 当构建完成整个`Program`后,调用下面的接口执行内存优化: - ```python - fluid.memory_optimize(fluid.default_main_program()) - ``` - - _注:内存优化目前仍在持续开发中,有可能不够稳定。_ - - - ---- - -### 总结:编译时执行流程 - - - -- 用户定义前向计算 -- 添加反向Op到`default_main_program` -- 添加 gradient clipping Op 到 -- 添加 regularization Op 到`default_main_program` -- 为指定的优化算法,添加相关的状态 variable of optimizer 到`default_startup_program` - - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等 -- 添加初始化 variable 的Op 到 `default_startup_program` -- 为整个网络最后一个op,添加设置其接受到的梯度的Op到`default_main_program` -- 进行内存优化规划 - - - ---- - -### Feed 数据 (一):通过 feed 字典 - - - -- 执行executor的run方法时,指定feed字典,feed op 会将指定的数据放到`x`和`y`两个Variable中 - ```python - y_data = np.random.randint(0, 8, [1]).astype("int32") - y_tensor = core.Tensor() - y_tensor.set(y_data, place) - - x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32") - x_tensor = core.Tensor() - x_tensor.set(x_data, place) - …… - cost = exe.run( - fluid.default_main_program(), - feed={'x': x_tensor, - 'y': y_tensor}, - fetchlist=[avg_cost]) - ``` - -- 这种方法较为底层,一般用于单测中 - - - ---- - -### Feed 数据 (二):使用 DataFeeder接口 - - - -- 编写一个data_reader函数,data_reader是一个Python generator - - ```python - def demo_reader(): - def random_generator(): - yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1]) - return random_generator - ``` -- 在训练任务中使用 DataFeeder 接口 - ```python - cost = exe.run( - fluid.default_main_program(), - feed={'x': x_tensor, - 'y': y_tensor}, - fetchlist=[avg_cost]) - - train_reader = paddle.batch( - paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - for data in train_reader(): - cost = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[cost]) - ``` - - - ---- - -### 常见问题 - - - -- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168) - - ```python - accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - for pass_id in range(PASS_NUM): - accuracy.reset() - for data in train_reader(): - loss, acc = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - # acc 当前一个batch 的 accuracy - # pass_acc 当前batch 的 accuracy - pass_total_acc = accuracy.eval(exe) # 整个pass的accuracy - ``` - -- 如何在训练中测试?[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144) -- 如何保存训练好的模型?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143) -- 如何加载训练好的模型进行预测?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154) -- 如何在同一个训练任务中定义多个Program,并交替运行? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py) -- 如何profile?Fluid 实现了profile 工具,可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py) - - - - ---- diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst deleted file mode 100644 index 2e7f70fc4cb871a80ffaffec6c06797973cd2f85..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/concepts/index_cn.rst +++ /dev/null @@ -1,4 +0,0 @@ -基本使用概念 -============ - -TBD diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst deleted file mode 100644 index 78cca1e2a3443c2949ca0655190b0f05502f519a..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/concepts/index_en.rst +++ /dev/null @@ -1,4 +0,0 @@ -Concepts -============ - -TBD diff --git a/doc/fluid/getstarted/concepts/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md deleted file mode 100644 index 2cd4b6225b61cf374458e40afabad7745f61ba71..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/concepts/reader/README.md +++ /dev/null @@ -1,206 +0,0 @@ -# Python Data Reader Design Doc - -During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: - -- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. -- A *reader creator*: A function that returns a reader function. -- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. -- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. - -and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. - -## Data Reader Interface - -*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: - -``` -iterable = data_reader() -``` - -The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) - -An example implementation for single item data reader creator is as follows: - -```python -def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader -``` - -An example implementation for multiple item data reader creator is as follows: -```python -def reader_creator_random_image_and_label(width, height, label): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height), label - return reader -``` - -## Batch Reader Interface - -*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. - -Here are some valid outputs: - -```python -# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. -[(1, 1, 1), -(2, 2, 2), -(3, 3, 3)] - -# a mini batch of three data items, each data item is a list (single column). -[([1,1,1],), -([2,2,2],), -([3,3,3],)] -``` - -Please note that each item inside the list must be a tuple, below is an invalid output: -```python - # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three columns of data, each of which is 1. -[[1,1,1], -[2,2,2], -[3,3,3]] -``` - -It is easy to convert from a reader to a batch reader: - -```python -mnist_train = paddle.dataset.mnist.train() -mnist_train_batch_reader = paddle.batch(mnist_train, 128) -``` - -It is also straight forward to create a custom batch reader: - -```python -def custom_batch_reader(): - while True: - batch = [] - for i in xrange(128): - batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. - yield batch - -mnist_random_image_batch_reader = custom_batch_reader -``` - -## Usage - -Following is how we can use the reader with PaddlePaddle: -The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: - -```python -# two data layer is created: -image_layer = paddle.layer.data("image", ...) -label_layer = paddle.layer.data("label", ...) - -# ... -batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) -paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) -``` - -## Data Reader Decorator - -The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. - -Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: - -### Prefetch Data - -Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. - -Use `paddle.reader.buffered` to prefetch data: - -```python -buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) -``` - -`buffered_reader` will try to buffer (prefetch) `100` data entries. - -### Compose Multiple Data Readers - -For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). - -We can do the following : - -```python -def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader - -def reader_creator_bool(t): - def reader: - while True: - yield t - return reader - -true_reader = reader_creator_bool(True) -false_reader = reader_creator_bool(False) - -reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) -# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care about the second item at this time. -paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) -``` - -### Shuffle - -Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. - -Example: -```python -reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) -``` - -## Q & A - -### Why does a reader return only a single entry, and not a mini batch? - -Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). - -We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. - -### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? - -In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. - -### Why use a dictionary instead of a list to provide mapping? - -Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). - -### How to create a custom data reader creator ? - -```python -def image_reader_creator(image_path, label_path, n): - def reader(): - f = open(image_path) - l = open(label_path) - images = numpy.fromfile( - f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') - images = images / 255.0 * 2.0 - 1.0 - labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") - for i in xrange(n): - yield images[i, :], labels[i] # a single entry of data is created each time - f.close() - l.close() - return reader - -# images_reader_creator creates a reader -reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) -paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) -``` - -### How is `paddle.train` implemented - -An example implementation of paddle.train is: - -```python -def train(batch_reader, mapping, batch_size, total_pass): - for pass_idx in range(total_pass): - for mini_batch in batch_reader(): # this loop will never end in online learning. - do_forward_backward(mini_batch, mapping) -``` diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md deleted file mode 100644 index 1f12ba0497369eacc6a2db7984781b5672f45ea1..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/concepts/save_model/model_format.md +++ /dev/null @@ -1,76 +0,0 @@ -# Design Doc: Model Format - -## Motivation - -A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. - -As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. - -## Implementation - -The topology is saved as a plain text in a detailed self-contain protobuf file. - -The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. - -As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, - -The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
field nametype description
version uint32_t Version of saved file. Always 0 now.
tensor desc length uint32_t TensorDesc(Protobuf message) length in bytes.
tensor desc void* TensorDesc protobuf binary message
tensor data void* Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()`
lod_level uint64_t Level of LoD
length of lod[0] uint64_t [Optional] length of lod[0] in bytes.
data of lod[0] uint64_t* [Optional] lod[0].data()
... ... ...
- -## Summary - -- We introduce a model format. -- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message. -- A bunch of specified format binary tensors describe the **parameters**. diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst deleted file mode 100644 index 3daea71d0933a2774227ff2b5e744392ca6b1765..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/index_cn.rst +++ /dev/null @@ -1,20 +0,0 @@ -新手入门 -============ - - -如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 - -.. toctree:: - :maxdepth: 1 - - quickstart_cn.rst - - -在使用PaddlePaddle构建应用时,需要了解一些基本概念。 -这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 - -.. toctree:: - :maxdepth: 1 - - concepts/use_concepts_cn.rst - developer's_guide_to_paddle_fluid.md diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst deleted file mode 100644 index fb20bb4f245281c3acf67c417979dc63c144fef3..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/index_en.rst +++ /dev/null @@ -1,19 +0,0 @@ -GET STARTED -============ - -If you want to quickly know how to use PaddlePaddle, please refer to the following guide: - -.. toctree:: - :maxdepth: 1 - - quickstart_en.rst - -While using PaddlePaddle to build applications, please understand some basic concepts. - -Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc. - -.. toctree:: - :maxdepth: 1 - - concepts/index_en.rst - developer's_guide_to_paddle_fluid.md diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst deleted file mode 100644 index 6a964d4f8561f30aa10936d2399698c51583442c..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/quickstart_cn.rst +++ /dev/null @@ -1,45 +0,0 @@ -快速开始 -======== - -快速安装 --------- - -PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 -执行下面的命令完成快速安装,版本为cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -更详细的安装和编译方法参考: :ref:`install_steps` 。 - -快速使用 --------- - -创建一个 housing.py 并粘贴此Python代码: - - .. code-block:: python - - import paddle.dataset.uci_housing as uci_housing - import paddle.fluid as fluid - - with fluid.scope_guard(fluid.core.Scope()): - # initialize executor with cpu - exe = fluid.Executor(place=fluid.CPUPlace()) - # load inference model - [inference_program, feed_target_names,fetch_targets] = \ - fluid.io.load_inference_model(uci_housing.fluid_model(), exe) - # run inference - result = exe.run(inference_program, - feed={feed_target_names[0]: uci_housing.predict_reader()}, - fetch_list=fetch_targets) - # print predicted price is $12,273.97 - print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) - -执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst deleted file mode 100644 index 680122f25893a5a48fac103266bda4788f891f6d..0000000000000000000000000000000000000000 --- a/doc/fluid/getstarted/quickstart_en.rst +++ /dev/null @@ -1,49 +0,0 @@ -Quick Start -============ - -Quick Install -------------- - -You can use pip to install PaddlePaddle with a single command, supports -CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. -Simply run the following command to install, the version is cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -For more details about installation and build: :ref:`install_steps` . - -Quick Use ---------- - -Create a new file called housing.py, and paste this Python -code: - - - .. code-block:: python - - import paddle.dataset.uci_housing as uci_housing - import paddle.fluid as fluid - - with fluid.scope_guard(fluid.core.Scope()): - # initialize executor with cpu - exe = fluid.Executor(place=fluid.CPUPlace()) - # load inference model - [inference_program, feed_target_names,fetch_targets] = \ - fluid.io.load_inference_model(uci_housing.fluid_model(), exe) - # run inference - result = exe.run(inference_program, - feed={feed_target_names[0]: uci_housing.predict_reader()}, - fetch_list=fetch_targets) - # print predicted price is $12,273.97 - print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) - -Run :code:`python housing.py` and voila! It should print out a list of predictions -for the test housing data. diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md deleted file mode 100644 index 55326940ce7c7dbaa5bf19f1950f470527ddf4f0..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md +++ /dev/null @@ -1,181 +0,0 @@ -# Fluid 分布式版本使用指南 -本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本 - -## 准备工作 -* 可用的集群 - - 包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。 -* 安装PaddlePaddle Fluid with Distribution版本 - - 所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。 - - **注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。 - cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例: -``` bash -cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON -``` - -## 更新训练脚本 -这里,我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。 -### 单机训练脚本示例 -```python -import paddle.v2 as paddle -import paddle.fluid as fluid - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = fluid.layers.fc(input=x, size=1, act=None) -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) - -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -sgd_optimizer.minimize(avg_cost) - -BATCH_SIZE = 20 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -exe.run(fluid.default_startup_program()) - -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - - if avg_loss_value[0] < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. -exit(1) -``` - -我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。 -### 介绍Parameter Server -在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md) - -**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。** - -### 分布式训练 -Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。 -```python -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) -``` -将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下: -```python -... #define the program, cost, and create sgd optimizer - -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters - -t = fluid.DistributeTranspiler() # create the transpiler instance -# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -... #create executor - -# in pserver, run this -#current_endpoint here means current pserver IP:PORT you wish to run on -pserver_prog = t.get_pserver_program(current_endpoint) -pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) -exe.run(pserver_startup) -exe.run(pserver_prog) - -# in trainer, run this -... # define data reader -exe.run(fluid.default_startup_program()) -for pass_id in range(100): - for data in train_reader(): - exe.run(t.get_trainer_program()) -``` -### 分布式训练脚本运行说明 -分布式任务的运行需要将表格中说明的多个参数进行赋值: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
参数名 值类型说明 示例
trainer_id int 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 0/1/2/3
pservers str parameter server 列表 127.0.0.1:6710,127.0.0.1:6711
trainers int 训练节点的总个数,>0的数字 4
server_endpoint str 当前所起的服务节点的IP:PORT 127.0.0.1:8789
training_rolestr 节点角色, TRAINER/PSERVER PSERVER
- - -**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下: - -```python -t = fluid.DistributeTranspiler() -t.transpile( - optimize_ops, - params_grads, - trainer_id, - pservers=pserver, - trainers=trainers) -if training_role == "PSERVER": - pserver_prog = t.get_pserver_program(server_endpoint) - pserver_startup = t.get_startup_program(server_endpoint, pserver_prog) -``` - -### Demo -完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。 - -第一步,进入demo代码所在目录: -```bash -cd /paddle/python/paddle/fluid/tests/book -``` - -第二步,启动Parameter Server: -```bash -PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py -``` -执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 - -第三步,启动Trainer: -```bash -PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py -``` -由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 - -现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。 diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_en.md b/doc/fluid/howto/cluster/fluid_cluster_train_en.md deleted file mode 100644 index b4465e8269c2e1603c02404ea33f8c4572e76442..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/cluster/fluid_cluster_train_en.md +++ /dev/null @@ -1,153 +0,0 @@ -# Fluid Distributed Training - -## Introduction - -In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster. - -## Preparations - -### Getting the cluster ready - -Prepare the compute nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, and with a unique IP address assigned to it. Make sure they can communicate to each other. - -### Have PaddlePaddle installed - -PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries. - -PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). - -In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows: - -``` bash -cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON -``` - -### Update the training script - -#### Non-cluster training script - -Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example. - -The non-cluster version of this demo with fluid API is as follows: - -``` python -import paddle.v2 as paddle -import paddle.fluid as fluid - -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = fluid.layers.fc(input=x, size=1, act=None) -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) - -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -sgd_optimizer.minimize(avg_cost) - -BATCH_SIZE = 20 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -exe.run(fluid.default_startup_program()) - -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - fluid.io.save_persistables(exe, "./fit_a_line.model/") - fluid.io.load_persistables(exe, "./fit_a_line.model/") - for data in train_reader(): - avg_loss_value, = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) - - if avg_loss_value[0] < 10.0: - exit(0) # if avg cost less than 10.0, we think our code is good. -exit(1) -``` - -We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes. - -Now let's try to convert it to a distributed version to run on a cluster. - -#### Introducing parameter server - -As we can see from the non-cluster version of training script, there is only one role in the script: the trainer, that performs the computing as well as holds the parameters. In cluster training, since multi-trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle. - -![parameter server architecture](src/trainer.png) - -Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md). - -Now we need to create programs for both: trainers and parameter servers, the question is how? - -#### Slice the program - -Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program. - -The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP. - -Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function. - -To put them together: - -``` python -... #define the program, cost, and create sgd optimizer - -optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters - -t = fluid.DistributeTranspiler() # create the transpiler instance -# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers -t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) - -... #create executor - -# in pserver, run this -#current_endpoint here means current pserver IP:PORT you wish to run on -pserver_prog = t.get_pserver_program(current_endpoint) -pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) -exe.run(pserver_startup) -exe.run(pserver_prog) - -# in trainer, run this -... # define data reader -exe.run(fluid.default_startup_program()) -for pass_id in range(100): - for data in train_reader(): - exe.run(t.get_trainer_program()) - - -``` - -### E2E demo - -Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py). -First `cd` into the folder that contains the `python` files. In this case: - -```bash -cd /paddle/python/paddle/fluid/tests/book_distribute -``` - -In parameter server node run the following in the command line: - -``` bash -PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py -``` - -*please note we assume that your parameter server runs at 192.168.1.2:6174* - -Wait until the prompt `Server listening on 192.168.1.2:6174` - -Then in 2 of your trainer nodes run this: - -``` bash -PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py -``` - -*the reason you need to run this command twice in 2 nodes is because: in the script we set the trainer count to be 2. You can change this setting on line 50* - -Now you have 2 trainers and 1 parameter server up and running. diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md deleted file mode 100644 index 92859e8f622d0c155128821c54252113c5016989..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/cluster/fluid_recordio.md +++ /dev/null @@ -1,127 +0,0 @@ -# How to use RecordIO in Fluid - -If you want to use RecordIO as your training data format, you need to convert to your training data -to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some -interface to deal with the RecordIO files. - -## Generate RecordIO File - -Before start training with RecordIO files, you need to convert your training data -to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes -as follows: - -```python - reader = paddle.batch(mnist.train(), batch_size=1) - feeder = fluid.DataFeeder( - feed_list=[ # order is image and label - fluid.layers.data( - name='image', shape=[784]), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder) -``` - -The above code snippet would generate a RecordIO `./mnist.recordio` on your host. - -**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can -adjust it flexibly while reading it. - -## Use the RecordIO file in a Local Training Job - -PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file -and then you can use them as a Layer in your network configuration, the sample codes as follows: - -```python - data_file = fluid.layers.io.open_recordio_file( - filename="./mnist.recordio", - shapes=[(-1, 784),(-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int32"]) - data_file = fluid.layers.io.batch(data_file, batch_size=4) - - img, label = fluid.layers.io.read_file(data_file) - hidden = fluid.layers.fc(input=img, size=100, act='tanh') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - - fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss) - - place = fluid.CPUPlace() - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - avg_loss_np = [] - - # train a pass - batch_id = 0 - while True: - tmp, = exe.run(fetch_list=[avg_loss]) - - avg_loss_np.append(tmp) - print(batch_id) - batch_id += 1 -``` - -## Use the RecordIO files in Distributed Training - -1. generate multiple RecordIO files - -For a distributed training job, you may have multiple trainer nodes, -and one or more RecordIO files for one trainer node, you can use the interface -`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data -into multiple RecordIO files, the sample codes as follows: - -```python - reader = paddle.batch(mnist.train(), batch_size=1) - feeder = fluid.DataFeeder( - feed_list=[ # order is image and label - fluid.layers.data( - name='image', shape=[784]), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - fluid.recordio_writer.convert_reader_to_recordio_files( - filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder) -``` - -The above codes would generate multiple RecordIO files on your host like: - -```bash -. - \_mnist-00000.recordio - |-mnist-00001.recordio - |-mnist-00002.recordio - |-mnist-00003.recordio - |-mnist-00004.recordio -``` - -2. open multiple RecordIO files by `fluid.layers.io.open_files` - -For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes, -each trainer process reads parts of the whole training data, we usually take the following approach to make the training -data allocated by each trainer process as uniform as possiable: - -```python -def gen_train_list(file_pattern, trainers, trainer_id): - file_list = glob.glob(file_pattern) - ret_list = [] - for idx, f in enumerate(file_list): - if (idx + trainers) % trainers == trainer_id: - ret_list.append(f) - return ret_list - -trainers = int(os.getenv("PADDLE_TRAINERS")) -trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) -data_file = fluid.layers.io.open_files( - filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), - thread_num=1, - shapes=[(-1, 784),(-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int32"]) -img, label = fluid.layers.io.read_file(data_files) -... -``` diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md deleted file mode 100644 index 8adaf324fccb4cda7af16b9bace559c0642ae444..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/cluster/nccl2_rdma_training.md +++ /dev/null @@ -1,110 +0,0 @@ -# Distributed Training with NCCL2 and RDMA - -When doing distributed multi-GPU training, network bandwidth often becomes the -bottleneck. We introduce a way to use NCCL2 to do such training job to -achieve best performance. - -## Prepare Hardware with RDMA and Multiple GPUs - -I'm using two Linux servers each of them installed with 8 GPUs and -one 100Gb RDMA card. -Base environment is: - -* OS: CentOS 7.4 -* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]" -* Kernel version: `4.4.88-1.el7.elrepo.x86_64` -* Docker version: `1.12.6` -* Docker storage driver: `overlay2` -* IP addresses: 192.168.16.30,192.168.16.34 - -In general, the steps including: - -1. Install GPU drivers -1. Install RDMA drivers -1. Install "InfiniBand Support" -1. Use docker to run tests and make sure GPUs and RDMA can work inside - the container. - -I'll omit the section "Install GPU drivers" because we can find it easily -somewhere else. - -### Install RDMA drivers - -For my case, I've got two machines with device -"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was -"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can -work with the latest overlay2 filesystem. - -***NOTE: before you start, make sure you have a way to get a console -of the server other than ssh because we may need to re-configure the -network device.*** - -1. Go to http://www.mellanox.com/page/products_dyn?product_family=26, - download `MLNX_OFED` software in the bottom of the page, and upload it - onto the server. -1. Run `./mlnxofedinstall --add-kernel-support` in the software package. -1. Run `/etc/init.d/openibd restart` to make everything work, note that - this operation may cause the network goes down if you are using this - RDMA device as default network device and use ssh to log in the server. -1. Re-configure the network interface, for example: - `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed: - `ip route add default via 192.168.16.1 dev eth2`. -1. Do the same thing on the other node. -1. Use `ping` to test if the two nodes have typical ICMP connection. -1. Use either `udaddy` or `ib_write_bw` to test the network connection is - ready and have the desired bandwidth. - -### Prepare Docker Image to Run RDMA Programs - -1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl - package in it. -1. Start a docker container and mount GPU driver libs into it (you can - skip this step if you are using nvidia-docker). -1. Mount RDMA drivers and libs into the docker image (see below section), - also `udaddy` and `ib_write_bw` if needed. -1. Mount GPU devices and RDMA devices into the container using `--device` - or just use privileged mode `--privileged`. -1. Start the container using host network mode: `--net=host` - -### RDMA Library Files Needed - -Usually, `MLNX_OFED` install latest supported libs under -`/usr/lib64/mlnx_ofed/valgrind`. Other libs also needed to run RDMA programs -is listed below. These libs must be mounted into the docker container. - -* Libs under `/usr/lib64/mlnx_ofed/valgrind` - * libibcm.so - * libibverbs.so - * libmlx4.so - * libmlx5.so - * libmlx5-rdmav2.so - * librdmacm.so -* Other libs: - * libnl-3.so.200 - * libnl-route-3.so.200 - * libnuma.so.1 - -## Start to Run the Training Job - -Setting NCCL environment variables to turn NCCL switches on and off: - - -| Env Name | Description | -| --- | --- | -| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 | -| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs | -| NCCL_IB_DISABLE | Set to 1 to disable using RDMA | -| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported | -| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO | - -My two servers are: `192.168.16.30,192.168.16.34`, On node 1, Run : - -```bash -PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py -``` - -On node 2, Run: - -```bash -PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py -``` diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst deleted file mode 100644 index b57af64f44da82926c4862578f3072960ca5aa92..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -进阶使用 ------------- - -.. toctree:: - :maxdepth: 1 - - inference/index_cn.rst - optimization/index_cn.rst diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst deleted file mode 100644 index fd21e167ce3a46da167db1e9d7013804f730e047..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -HOW TO ------------- - -.. toctree:: - :maxdepth: 1 - - optimization/index_en.rst diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst deleted file mode 100644 index 91357dd8c8da19f2f33c6f285ed7eb234428b1ab..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst +++ /dev/null @@ -1,97 +0,0 @@ -安装与编译C++预测库 -=========================== - -直接下载安装 -------------- - -====================== ======================================== -版本说明 C++预测库 -====================== ======================================== -cpu_avx_mkl `fluid.tgz `_ -cpu_avx_openblas `fluid.tgz `_ -cpu_noavx_openblas `fluid.tgz `_ -cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ -cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ -====================== ======================================== - -从源码编译 ----------- -用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: - -================= ========= -选项 值 -================= ========= -CMAKE_BUILD_TYPE Release -FLUID_INSTALL_DIR 安装路径 -WITH_FLUID_ONLY ON(推荐) -WITH_SWIG_PY OFF(推荐 -WITH_PYTHON OFF(推荐) -WITH_GPU ON/OFF -WITH_MKL ON/OFF -================= ========= - -建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 - -下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): - - .. code-block:: bash - - pip install paddlepaddle-gpu - PADDLE_ROOT=/path/of/capi - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - mkdir build - cd build - cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_FLUID_ONLY=ON \ - -DWITH_SWIG_PY=OFF \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. - make - make inference_lib_dist - -成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) -均会存放于PADDLE_ROOT目录中。目录结构如下: - - .. code-block:: text - - PaddleRoot/ - ├── CMakeCache.txt - ├── paddle - │   └── fluid - │   ├── framework - │   ├── inference - │   ├── memory - │   ├── platform - │   ├── pybind - │   └── string - ├── third_party - │   ├── boost - │   │   └── boost - │   ├── eigen3 - │   │   ├── Eigen - │   │   └── unsupported - │   └── install - │   ├── gflags - │   ├── glog - │   ├── mklml - │   ├── protobuf - │   ├── snappy - │   ├── snappystream - │   └── zlib - └── version.txt - -version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: - - .. code-block:: text - - GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8 - WITH_MKL: ON - WITH_GPU: ON - CUDA version: 8.0 - CUDNN version: v5 diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst deleted file mode 100644 index a903423548decd0992bf19772fb2cb143f6a12b5..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/inference/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -预测库 ------------- - -.. toctree:: - :maxdepth: 1 - - build_and_install_lib_cn.rst - inference_support_in_fluid_cn.md diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md deleted file mode 100644 index 309b17fccd5c461c9c22beb64eb4c6792b7e4a7a..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md +++ /dev/null @@ -1,304 +0,0 @@ -# 使用指南 - -## 目录: - -- Python Inference API -- Inference C++ API -- Inference实例 -- Inference计算优化 - -## Python Inference API **[改进中]** -- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295)) - - ```python - def save_inference_model(dirname, - feeded_var_names, - target_vars, - executor, - main_program=None, - model_filename=None, - params_filename=None): - ``` - Inference模型和参数将会保存到`dirname`目录下: - - 序列化的模型 - - `model_filename`为`None`,保存到`dirname/__model__` - - `model_filename`非`None`,保存到`dirname/model_filename` - - 参数 - - `params_filename`为`None`,单独保存到各个独立的文件,各文件以参数变量的名字命名 - - `params_filename`非`None`,保存到`dirname/params_filename` - -- 两种存储格式 - - 参数保存到各个独立的文件 - - 如,设置`model_filename`为`None`、`params_filename`为`None` - - ```bash - $ cd recognize_digits_conv.inference.model - $ ls - $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0 - ``` - - 参数保存到同一个文件 - - 如,设置`model_filename`为`None`、`params_filename`为`__params__` - - ```bash - $ cd recognize_digits_conv.inference.model - $ ls - $ __model__ __params__ - ``` -- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380)) - ```python - def load_inference_model(dirname, - executor, - model_filename=None, - params_filename=None): - ... - return [program, feed_target_names, fetch_targets] - ``` - -## 链接Fluid Inference库 -- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git)) - - - GCC配置 - ```bash - $ g++ -o a.out -std=c++11 main.cc \ - -I${PADDLE_ROOT}/ \ - -I${PADDLE_ROOT}/third_party/install/gflags/include \ - -I${PADDLE_ROOT}/third_party/install/glog/include \ - -I${PADDLE_ROOT}/third_party/install/protobuf/include \ - -I${PADDLE_ROOT}/third_party/eigen3 \ - -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \ - -lrt -ldl -lpthread - ``` - - - CMake配置 - ```cmake - include_directories(${PADDLE_ROOT}/) - include_directories(${PADDLE_ROOT}/third_party/install/gflags/include) - include_directories(${PADDLE_ROOT}/third_party/install/glog/include) - include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include) - include_directories(${PADDLE_ROOT}/third_party/eigen3) - target_link_libraries(${TARGET_NAME} - ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so - -lrt -ldl -lpthread) - ``` - - - 设置环境变量: - `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH` - - - -## C++ Inference API - -- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91)) - - - 1、 初始化设备 - ```cpp - #include "paddle/fluid/framework/init.h" - paddle::framework::InitDevices(false); - ``` - - - 2、 定义place,executor,scope - ```cpp - auto place = paddle::platform::CPUPlace(); - auto executor = paddle::framework::Executor(place); - auto* scope = new paddle::framework::Scope(); - ``` - - - 3、 加载模型 - ```cpp - #include "paddle/fluid/inference/io.h" - auto inference_program = paddle::inference::Load(executor, *scope, dirname); - // or - auto inference_program = paddle::inference::Load(executor, - *scope, - dirname + "/" + model_filename, - dirname + "/" + params_filename); - ``` - - - 4、 获取`feed_target_names`和`fetch_target_names` - ```cpp - const std::vector& feed_target_names = inference_program->GetFeedTargetNames(); - const std::vector& fetch_target_names = inference_program->GetFetchTargetNames(); - ``` - - - 5、 准备`feed`数据 - ```cpp - #include "paddle/fluid/framework/lod_tensor.h" - std::vector cpu_feeds; - ... - std::map feed_targets; - for (size_t i = 0; i < feed_target_names.size(); ++i) { - // Please make sure that cpu_feeds[i] is right for feed_target_names[i] - feed_targets[feed_target_names[i]] = cpu_feeds[i]; - } - ``` - - - 6、 定义`Tensor`来`fetch`结果 - ```cpp - std::vector cpu_fetchs; - std::map fetch_targets; - for (size_t i = 0; i < fetch_target_names.size(); ++i) { - fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; - } - ``` - - - 7、 执行`inference_program` - ```cpp - executor.Run(*inference_program, scope, feed_targets, fetch_targets); - ``` - - - 8、 使用`fetch`数据 - ```cpp - for (size_t i = 0; i < cpu_fetchs.size(); ++i) { - std::cout << "lod_i: " << cpu_fetchs[i]->lod(); - std::cout << "dims_i: " << cpu_fetchs[i]->dims(); - std::cout << "result:"; - float* output_ptr = cpu_fetchs[i]->data(); - for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) { - std::cout << " " << output_ptr[j]; - } - std::cout << std::endl; - } - ``` - 针对不同的数据,4. - 8.可执行多次。 - - - 9、 释放内存 - ```cpp - delete scope; - ``` - - -- 接口说明 - - ```cpp - void Run(const ProgramDesc& program, Scope* scope, - std::map& feed_targets, - std::map& fetch_targets, - bool create_vars = true, - const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch"); - ``` - - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`,用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。 - - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致,可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program` - - 默认情况下,除了`persistable`属性设置为`True`的`Variable`之外,每次执行`executor.Run`会创建一个局部`Scope`,并且在这个局部`Scope`中创建和销毁所有的`Variable`,以最小化空闲时的内存占用。 - - `persistable`属性为`True`的`Variable`有: - - Operators的参数`w`、`b`等 - - `feed_op`的输入变量 - - `fetch_op`的输出变量 - - -- **不在每次执行时创建和销毁变量 - ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))** - - 执行`inference_program` - ```cpp - // Call once - executor.CreateVariables(*inference_program, scope, 0); - // Call as many times as you like - executor.Run( - *inference_program, scope, feed_targets, fetch_targets, false); - ``` - - **优点** - - 节省了频繁创建、销毁变量的时间(约占每次`Run`总时间的1% ~ 12%) - - 执行结束后可获取所有Operators的计算结果 - - **缺点** - - 空闲时也会占用大量的内存 - - 在同一个`Scope`中,相同的变量名是公用同一块内存的,容易引起意想不到的错误 - - -- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))** - - 执行`inference_program` - ```cpp - // Call once - auto ctx = executor.Prepare(*inference_program, 0); - // Call as many times as you like if you have no need to change the inference_program - executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets); - ``` - - **优点** - - 节省了频繁创建、销毁Op的时间 - - **缺点** - - 一旦修改了`inference_program`,则需要重新创建`ctx` - - -- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))** - - 主线程 - - 1、 初始化设备 - - 2、 定义`place`,`executor`,`scope` - - 3、 加载模型,得到`inference_program` - - 从线程 - - **复制`inference_program`得到`copy_program`,修改`copy_program`的`feed_holder_name`和`fetch_holder_name`** - ```cpp - auto copy_program = std::unique_ptr( - new paddle::framework::ProgramDesc(*inference_program)); - std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id); - std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id); - copy_program->SetFeedHolderName(feed_holder_name); - copy_program->SetFetchHolderName(fetch_holder_name); - ``` - - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names` - - 5、 准备feed数据,定义Tensor来fetch结果 - - 6、 执行`copy_program` - ```cpp - executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name); - ``` - - 7、 使用fetch数据 - - 主线程 - - 8、 释放资源 - - -- 基本概念 - - 数据相关: - - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md),一个N维数组,数据可以是任意类型(int,float,double等) - - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md),带LoD(Level-of-Detail)即序列信息的Tensor - - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md),记录了变量Variable - - 执行相关: - - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md),无状态执行器,只跟设备相关 - - Place - - CPUPlace,CPU设备 - - CUDAPlace,CUDA GPU设备 - - 神经网络表示: - - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). - - 详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md) - - - -## Inference实例 - - 1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc) - 1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc) - 1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc) - 1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc) - 1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc) - 1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc) - 1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc) - - -## Inference计算优化 -- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py)) - ```python - class InferenceTranspiler: - def transpile(self, program, place, scope=None): - ... - if scope is None: - scope = global_scope() - ... - ``` - - 使用`InferenceTranspiler`将会直接修改`program`。 - - 使用`InferenceTranspiler`会修改参数的值,请确保`program`的参数在`scope`内。 -- 支持的优化 - - 融合batch_norm op的计算 -- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py)) - ```python - import paddle.fluid as fluid - # NOTE: Applying the inference transpiler will change the inference_program. - t = fluid.InferenceTranspiler() - t.transpile(inference_program, place, inference_scope) - ``` - - - - -## 内存使用优化 -- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py)) - ```python - fluid.memory_optimize(inference_program) - ``` diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst deleted file mode 100644 index 9404800eb86ca6d27886258b67393028c76954dc..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/benchmark/index_cn.rst +++ /dev/null @@ -1,8 +0,0 @@ -基准 ------------- - -.. toctree:: - :maxdepth: 1 - - vgg16/README.md - README.md diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst deleted file mode 100644 index 1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/benchmark/index_en.rst +++ /dev/null @@ -1,8 +0,0 @@ -Benchmark ------------- - -.. toctree:: - :maxdepth: 1 - - vgg16/README.md - README.md diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md deleted file mode 100644 index 198a05a79e19227e90eaafe116217a164cd51a7d..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/cpu_profiling_cn.md +++ /dev/null @@ -1,183 +0,0 @@ -# CPU性能调优 - -此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 - -Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 - -PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: - -* Python 代码的性能分析 -* Python 与 C++ 混合代码的性能分析 - - -## Python代码的性能分析 - -### 生成性能分析文件 - -Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: - -```bash -python -m cProfile -o profile.out main.py -``` - -其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 - -### 查看性能分析文件 - -`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: - -```bash -cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py -``` - -其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 - -用Web浏览器访问对应网址,即可显示性能分析的结果: - -``` - ncalls tottime percall cumtime percall filename:lineno(function) - 1 0.284 0.284 29.514 29.514 main.py:1() - 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) - 4696 12.040 0.003 12.040 0.003 {built-in method run} - 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() -``` - -每一列的含义是: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
列名含义
ncalls 函数的调用次数
tottime 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间
percall tottime的每次调用平均时间
cumtime 函数总时间。包含这个函数调用其他函数的时间
percall cumtime的每次调用平均时间
filename:lineno(function) 文件名, 行号,函数名
- - -### 寻找性能瓶颈 - -通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 - -将性能分析结果按照tottime排序,效果如下: - -```text - 4696 12.040 0.003 12.040 0.003 {built-in method run} - 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) - 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) - 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) - 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() -``` - -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 - -```text -Called By: - - Ordered by: internal time - List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> - -Function was called by... - ncalls tottime cumtime -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) - 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) - - -Called: - - Ordered by: internal time - List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> -``` - -通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 - - - -## Python与C++混合代码的性能分析 - -### 生成性能分析文件 - -C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 - -使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 - -```bash -apt update -apt install libgoogle-perftools-dev -pip install yep -``` - -安装完毕后,我们可以通过 - -```bash -python -m yep -v main.py -``` - -生成性能分析文件。生成的性能分析文件为`main.py.prof`。 - -命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: - -1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 -2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 -3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 - -### 查看性能分析文件 - -在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 - -安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: - -```bash -go get github.com/google/pprof -``` - -进而我们可以使用如下命令开启一个HTTP服务: - -```bash -pprof -http=0.0.0.0:3213 `which python` ./main.py.prof -``` - -这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 - -访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: - -![result](./pprof_1.png) - - -### 寻找性能瓶颈 - -与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 - -例如下图中, - -![kernel_perf](./pprof_2.png) - -在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 - -在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md deleted file mode 100644 index 216694965b3c878a8a5f3ccd2a0cba8d21d9ce05..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ /dev/null @@ -1,224 +0,0 @@ -# Tune CPU performance - -This tutorial introduces techniques we use to profile and tune the -CPU performance of PaddlePaddle. We will use Python packages -`cProfile` and `yep`, and Google's `perftools`. - -Profiling is the process that reveals performance bottlenecks, -which could be very different from what's in the developers' mind. -Performance tuning is done to fix these bottlenecks. Performance optimization -repeats the steps of profiling and tuning alternatively. - -PaddlePaddle users program AI applications by calling the Python API, which calls -into `libpaddle.so.` written in C++. In this tutorial, we focus on -the profiling and tuning of - -1. the Python code and -1. the mixture of Python and C++ code. - -## Profiling the Python Code - -### Generate the Performance Profiling File - -We can use Python standard -package, [`cProfile`](https://docs.python.org/2/library/profile.html), -to generate Python profiling file. For example: - -```bash -python -m cProfile -o profile.out main.py -``` - -where `main.py` is the program we are going to profile, `-o` specifies -the output file. Without `-o`, `cProfile` would outputs to standard -output. - -### Look into the Profiling File - -`cProfile` generates `profile.out` after `main.py` completes. We can -use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into -the details: - -```bash -cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py -``` - -where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` -specifies the profiling file, and `main.py` is the source file. - -Open the Web browser and points to the local IP and the specifies -port, we will see the output like the following: - -``` - ncalls tottime percall cumtime percall filename:lineno(function) - 1 0.284 0.284 29.514 29.514 main.py:1() - 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) - 4696 12.040 0.003 12.040 0.003 {built-in method run} - 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() -``` - -where each line corresponds to Python function, and the meaning of -each column is as follows: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
columnmeaning
ncalls the number of calls into a function
tottime the total execution time of the function, not including the execution time of other functions called by the function
percall tottime divided by ncalls
cumtime the total execution time of the function, including the execution time of other functions being called
percall cumtime divided by ncalls
filename:lineno(function) where the function is define
- -### Identify Performance Bottlenecks - -Usually, `tottime` and the related `percall` time is what we want to -focus on. We can sort above profiling file by tottime: - -```text - 4696 12.040 0.003 12.040 0.003 {built-in method run} - 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) - 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) - 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) - 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() -``` - -We can see that the most time-consuming function is the `built-in -method run`, which is a C++ function in `libpaddle.so`. We will -explain how to profile C++ code in the next section. At this -moment, let's look into the third function `sync_with_cpp`, which is a -Python function. We can click it to understand more about it: - -``` -Called By: - - Ordered by: internal time - List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> - -Function was called by... - ncalls tottime cumtime -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) -/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) - 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) - - -Called: - - Ordered by: internal time - List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> -``` - -The lists of the callers of `sync_with_cpp` might help us understand -how to improve the function definition. - -## Profiling Python and C++ Code - -### Generate the Profiling File - -To profile a mixture of Python and C++ code, we can use a Python -package, `yep`, that can work with Google's `perftools`, which is a -commonly-used profiler for C/C++ code. - -In Ubuntu systems, we can install `yep` and `perftools` by running the -following commands: - -```bash -apt update -apt install libgoogle-perftools-dev -pip install yep -``` - -Then we can run the following command - -```bash -python -m yep -v main.py -``` - -to generate the profiling file. The default filename is -`main.py.prof`. - -Please be aware of the `-v` command line option, which prints the -analysis results after generating the profiling file. By examining the - the print result, we'd know that if we stripped debug -information from `libpaddle.so` at build time. The following hints -help make sure that the analysis results are readable: - -1. Use GCC command line option `-g` when building `libpaddle.so` so to - include the debug information. The standard building system of - PaddlePaddle is CMake, so you might want to set - `CMAKE_BUILD_TYPE=RelWithDebInfo`. - -1. Use GCC command line option `-O2` or `-O3` to generate optimized - binary code. It doesn't make sense to profile `libpaddle.so` - without optimization, because it would anyway run slowly. - -1. Profiling the single-threaded binary file before the - multi-threading version, because the latter often generates tangled - profiling analysis result. You might want to set environment - variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically - starting multiple threads. - -### Examining the Profiling File - -The tool we used to examine the profiling file generated by -`perftools` is [`pprof`](https://github.com/google/pprof), which -provides a Web-based GUI like `cprofilev`. - -We can rely on the standard Go toolchain to retrieve the source code -of `pprof` and build it: - -```bash -go get github.com/google/pprof -``` - -Then we can use it to profile `main.py.prof` generated in the previous -section: - -```bash -pprof -http=0.0.0.0:3213 `which python` ./main.py.prof -``` - -Where `-http` specifies the IP and port of the HTTP service. -Directing our Web browser to the service, we would see something like -the following: - -![result](./pprof_1.png) - -### Identifying the Performance Bottlenecks - -Similar to how we work with `cprofilev`, we'd focus on `tottime` and -`cumtime`. - -![kernel_perf](./pprof_2.png) - -We can see that the execution time of multiplication and the computing -of the gradient of multiplication takes 2% to 4% of the total running -time, and `MomentumOp` takes about 17%. Obviously, we'd want to -optimize `MomentumOp`. - -`pprof` would mark performance critical parts of the program in -red. It's a good idea to follow the hints. diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md deleted file mode 100644 index 7fb0883dd937465d15479b29df95078edb50e069..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md +++ /dev/null @@ -1,89 +0,0 @@ -# 堆内存分析和优化 - -计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 - - -目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。 - -因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。 - -本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。 - -gperftool主要支持以下四个功能: - -- thread-caching malloc -- heap-checking using tcmalloc -- heap-profiling using tcmalloc -- CPU profiler - -Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。 - -对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 - -## 环境 - -本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 - -## 使用流程 - -- 安装google-perftools - -``` -apt-get install libunwind-dev -apt-get install google-perftools -``` - -- 安装pprof - -``` -go get -u github.com/google/pprof -``` - -- 设置运行环境 - -``` -export PPROF_PATH=/root/gopath/bin/pprof -export PPROF_BINARY_PATH=/root/gopath/bin/pprof -export LD_PRELOAD=/usr/lib/libtcmalloc.so.4 -``` - -- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。 - -``` -# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀 -# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump,默认1GB -env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py -``` - -随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下: - -``` --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap --rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap -``` - -- 使用pprof对heap文件进行分析。分析有两种模式: - - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。 - - ``` - pprof --pdf python test.log.0012.heap - ``` - 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少,所以被忽略了,这对于分配内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。 - - ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png) - - - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。 - ``` - pprof --pdf --base test.log.0010.heap python test.log.1045.heap - ``` - 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf) - - 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。 - - ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png) - ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png) - diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst deleted file mode 100644 index 27cc96702356703b339db845dc81913bdcc9f23b..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -性能优化 ------------- - -.. toctree:: - :maxdepth: 1 - - timeline.md - cpu_profiling_cn.md - benchmark/index_cn.rst diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst deleted file mode 100644 index 4ce624fe8f108a6afc7cd08a1542332755d22e04..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@ -Performance Optimization ---------------------------- - -.. toctree:: - :maxdepth: 1 - - timeline.md - cpu_profiling_en.md - benchmark/index_en.rst diff --git a/doc/fluid/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png deleted file mode 100644 index 8e9edbf377672d0ef40f2fc7bd39e746923550cb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/optimization/pprof_1.png and /dev/null differ diff --git a/doc/fluid/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png deleted file mode 100644 index 172ba20399ba974d27f4c072425277b69b02520b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/optimization/pprof_2.png and /dev/null differ diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg deleted file mode 100644 index 38ec3f80c982857531f30a8bb0fa26ea5bf05385..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/optimization/timeline.jpeg and /dev/null differ diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md deleted file mode 100644 index faf39f276dbddcd4961407ba2d082c9826051cbe..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/timeline_cn.md +++ /dev/null @@ -1,32 +0,0 @@ -# 如何使用timeline工具做性能分析 - -1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 - - **提示:** - 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 - - ```python - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - if pass_id == 0 and batch_id == 5: - profiler.start_profiler("All") - elif pass_id == 0 and batch_id == 10: - profiler.stop_profiler("total", "/tmp/profile") - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[]) - ... - ``` - -1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 -```python -python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline -``` - -1. 打开chrome浏览器,访问,用`load`按钮来加载生成的`timeline`文件。 - - ![chrome tracing](./tracing.jpeg) - -1. 结果如下图所示,可以放到来查看timetime的细节信息。 - - ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md deleted file mode 100644 index 6f963c6b4da6967fb2f493ada917a4b08917fa4c..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/optimization/timeline_en.md +++ /dev/null @@ -1,33 +0,0 @@ -# how to use timeline tool to do profile - -1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. - - ```python - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - if pass_id == 0 and batch_id == 5: - profiler.start_profiler("All") - elif pass_id == 0 and batch_id == 10: - profiler.stop_profiler("total", "/tmp/profile") - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[]) - ... - ``` - -1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another -file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at -[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details. - -```python -python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline -``` - -1. Open chrome and visit , use `load` button to load the generated `timeline` file. - - ![chrome tracing](./tracing.jpeg) - -1. The resulting timeline should be like: - - - ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg deleted file mode 100644 index 3a49fc4f8a401a9463b0157e2f38c164ca02dcc5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/optimization/tracing.jpeg and /dev/null differ diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md deleted file mode 100644 index 749cf7693c75696feb17f8556224ed03649baa80..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/performance/error_clip.md +++ /dev/null @@ -1,92 +0,0 @@ -# Error Clip - -## Overview - -Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary. -## Usage - -Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor: - -```python -var = framework.Variable(..., error_clip=myErrorClip, ...) -``` - -The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is: - -```python -ErrorClipByValue(max, min=None) -``` - -`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When the `min` is None, the minimal threshold will be assigned with `-max` automatically. - -So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by: - -```python -var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) -``` - -## Implementation - -The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*. - -```python -class BaseErrorClipAttr(object): - def append_clip_op(self, block, grad_name): - raise NotImplementedError() - - -class ErrorClipByValue(BaseErrorClipAttr): - def __init__(self, max, min=None): - max = float(max) - if min is None: - min = -max - else: - min = float(min) - self.max = max - self.min = min - - def append_clip_op(self, block, grad_name): - clip_op_desc = block.desc.append_op() - clip_op_desc.set_type("clip") - clip_op_desc.set_input("X", [grad_name]) - clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc.set_attr("min", self.min) - clip_op_desc.set_attr("max", self.max) -``` - -The `BaseErrorClipAttr` have one main member functions: `append_clip_op(self, block, grad_name)`. - -This function is used to create a `clip_op` and append it to the end of given `block`. For different error clip algorithm require different `clip_op`, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function. - -These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added. - -```python -for op_desc in grad_op_descs: - new_op_desc = target_block.desc.append_op() - new_op_desc.copy_from(op_desc) - callback(block=target_block, context=grad_to_var) -``` - -Here we employ a callback function to complete this kind of jobs. In `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function. - -The callback function for `clip_op` appending is defined in *clip.py*: - -```python -def error_clip_callback(block, context): - # the context is a grad_to_var map - grad_to_var = context - op_desc = block.desc.op(block.desc.op_size() - 1) - for grad_n in filter(lambda n: grad_to_var.has_key(n), - op_desc.output_arg_names()): - fwd_var = block.__var_recursive(grad_to_var[grad_n]) - error_clip = getattr(fwd_var, "error_clip", None) - if not (error_clip is None or isinstance(error_clip, - BaseErrorClipAttr)): - raise TypeError( - "Variable's error_clip should be an instance of BaseErrorClipAttr or None." - ) - if error_clip is not None: - error_clip.append_clip_op(block, grad_n) -``` - -This function takes a `block` and a `context`(which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`. diff --git a/doc/fluid/howto/performance/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png deleted file mode 100644 index d57b71ca88aaba5d05584a6219d84214e285a1e1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/performance/images/profiler.png and /dev/null differ diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md deleted file mode 100644 index ee96e7c74ce317caddb387cbb1d4998937bd5c81..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/performance/profiler.md +++ /dev/null @@ -1,97 +0,0 @@ -## Introduction - -There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices. The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks. The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool. - -## Architecture - -The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers may want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. And in order to speedup training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to be enabled or disabled by the developers. At last, the profiler should present a human-readable report. - -```python -for i in xrange(M): # M is the iteration number - for op in operator_lists: # The `operator_lists` contains all the operators in the network. - op.run(); -``` - -In summary, the proflier should have following features: - -- records time span in loop. -- supports nested time span. -- supports multiple threads/multiple GPUs. -- supports to be enabled and disabled by users. - -But how to record the time for the mixed C++ and CUDA program? There many C++ APIs to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summarize and present statistics based on these events. - -The overall flow is shown as the following figure. - -
- -### Event - -In above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Except this two kinds of event, sometime, a only marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of event: - -```c++ -enum EventKind { - kMark, - kPushRange, - kPopRange}; -``` -- kMark: only a marker without time range. -- kPushRange: mark the starting event for time range. -- kPopRange: mark the ending event for time range. - -For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece. - -```c++ -class Event { - public: - // The DeviceContext is used to get current CUDA stream. - Event(EventKind kind, std::string name, uint32_t thread_id, - const platform::DeviceContext* dev_ctx = nullptr); - double CpuElapsedUs(const Event& e) const; - double CudaElapsedUs(const Event& e) const; - - private: - EventKind kind_; - std::string name_; - uint32_t thread_id_; - int64_t cpu_ns_; -#ifdef PADDLE_WITH_CUDA - cudaEvent_t event_ = nullptr; - int device_ = -1; -#endif -}; - -struct EventList { - std::forward_list> event_blocks; -}; -``` - -As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. - -```c++ -enum ProfilerState { - kDisabled, - kCPU, - kCUDA -}; -ProfilerState g_state; -``` -- kDisabled: the disabled state. -- kCPU: CPU profiling state. -- kCUDA: GPU profiling state. - -A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`. - -```c++ -struct RecordEvent { - explicit RecordEvent(const std::string name, - platform::DeviceContext* dev_ctx = nullptr) { - if (kState == ProfilerState::kDisabled) return; - // push the starting event to the event lists. - } - ~RecordEvent() { - if (kState == ProfilerState::kDisabled) return; - // push the ending event to the event lists. - } -}; -``` diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle deleted file mode 100644 index cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle and /dev/null differ diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png deleted file mode 100644 index 87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/third_party/images/multigpu_allreduce.png and /dev/null differ diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle deleted file mode 100644 index 6c35ab1b21fb76ceae82d3693ed0d085b5bc0855..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle and /dev/null differ diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png deleted file mode 100644 index 9c8f7711165d80a2fa3911280fdee91855a401b1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/howto/third_party/images/multigpu_before_convert.png and /dev/null differ diff --git a/doc/fluid/howto/third_party/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md deleted file mode 100644 index bef126f3f0577b69f646dfe5d10539b372c6a8a5..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/third_party/mkldnn_fluid.md +++ /dev/null @@ -1,149 +0,0 @@ -# Design Doc: Add MKLDNN Kernel in Fluid Operator - -## Principles - -First of all, we should follow some basical principles like: -1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. -2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). -3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels. -4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. - -## Sulution - -In general, there are four parts we should follow to run a MKL-DNN primitive. -- Create a primitive descriptor that describe this operator -- Create a primitive itself by primitive descriptor and the engine -- Create all memory buffers that primitive needed -- Launch a stream to execute the primitive created -More details can refer to [here](http://01org.github.io/mkl-dnn). - -It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \ -So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). - -It's assumed that following three conditions should be satisfied. -1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. -2. the `Input Tensor` inside `Compute` function is the one after converted. -3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. - -### Compute -The algorithm of `Compute` would be described as follow, let's take conv like an example. - -```c++ - - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); - - auto& dev_ctx = ctx.template device_context(); - - // find primitive by unique key from mkldnn context - // the op_key should be a unique name of this op instance - auto& p = dev_ctx.findPrimitive(op_key + "_fwd"); - - // assuming the input tensor inside this compute function is the one after converted - // this point should be guarantee by another mechanism - auto& i = dev_ctx.findMemory(op_key + "_input"); - - if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { - auto fwd_primitive_desc = createPrimitiveDesc(ctx); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); - shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); - shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); - shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out)); - - dev_ctx.addMemory(op_key+"_input", in); - dev_ctx.addMemory(op_key+"_output", out); - dev_ctx.addMemory(op_key+"_filer", wgt); - dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive); - dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); - } - - p = dev_ctx.findPrimitive(op_key + "_fwd"); - - PADDLE_ENFORCE(p, "Should have forward Primitive"); - PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); - PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); - PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); - PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); - dev_ctx.submit(p); - dev_ctx.execute(); // the convert primitive should have already contained. - -``` - -The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this: -```c++ - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - algorithm algo = static_cast(ctx.Attr("convolution_algorithm_option")); - prop_kind pk = ctx.Attr("is_test") ? prop_kind::forward_inference : prop_kind::forward_training; - - auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/); - shared_ptr fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine())); - - return fwd_primitive_desc; - } -``` - -### MKLDNNDeviceContext -`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed. - - -### mkldnn_helper -Some functions would be put in `paddle/platform/mkldnn_helper.h`. -- create MKLDNN memories -- create MKLDNN primitives -- error check function -- etc - - -### Kernel Switch -We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it. - -`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. -`trans` would be like this: - -```c++ -void trans(inputs, ctx) override { - if (NoNeedTrans()) { - return; - } - // find reorder primitive by op_key from context - auto& dev_ctx = ctx.template device_context(); - auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input"); - auto& i = dev_ctx.findMemory(op_key + "_src_input"); - - if (p == nullptr || i == nullptr || changeSized(i, input)) { - auto prim = createPrimitiveDesc(ctx); - auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data); - auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes()); - auto dst = createMemory(p->expected_desc(), newbuffer->data); - auto reorder_primitive(new mkldnn::reorder(src, dst)); - - dev_ctx.addMemory(op_key+"_src_input", src); - dev_ctx.addMemory(op_key+"_input", dst); - dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); - } - - p = dev_ctx.findPrimitive(op_key + "_reorder_input"); - PADDLE_ENFORCE(p, "Should have Reorder Primitive"); - dev_ctx.submit(p); - if (! this->isMKLDNNKernel()) { - // execute immediately only if this is not mkldnn kernel function. - // otherwise, it can be executed with the operator primitive in Compute - dev_ctx.stream(); - } - // after submit, the input tensor in ExecutionContext should be changed as the converted one - // there should be another mechanism to ensure this -} -``` - -### Unit Test -All the functions should be tested corresponding. -TBD diff --git a/doc/fluid/howto/third_party/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md deleted file mode 100644 index c7dac70998a6cfec3a6d2fc72b698ff9722e6805..0000000000000000000000000000000000000000 --- a/doc/fluid/howto/third_party/paddle_nccl.md +++ /dev/null @@ -1,65 +0,0 @@ -# Design Doc: NCCL support in Paddle Fluid - -## Abstract - -This Design Doc refers to the NCCL feature in paddle. We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script. - - -## Motivation - -[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. - -- Pros -1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library. -1. high performance in NVIDIA GPUs. -1. MPI like primitives, which have low learning cost for users. - -- Cons -1. Only design for NVIDIA GPUs, not a general multi-device solution. -1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore. - -At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests. - -As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information. - -Besides, it needs interfaces to synchronize model update with each different GPU Cards. - -## Implementation - -As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created. - -### Transpiler - -To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices. - -1. The user-defined model will be a single device program - -2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator. - - *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines* - - - -After compiling, the graph as shows - - - -Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc. - -- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU. -- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based communicating method, avoid of the bottle neck in a single GPU. - -Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph. - -As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`. - -- **AllReduce** - Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is -1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs. -2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs. -3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients. -4. Then the root card will optimize the parameter. -5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one. -6. Finish the sychronization round. - -The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase. diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png deleted file mode 100644 index 67daf566f91aab570e60971c4ea8e2be876e214d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/1.png and /dev/null differ diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png deleted file mode 100644 index 43367777f41449a666e7a3b571f09ac5d5dfb1ae..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/2.png and /dev/null differ diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot deleted file mode 100644 index 5d77865061ca7bbbfcf254dd938f09aef5553505..0000000000000000000000000000000000000000 --- a/doc/fluid/images/2_level_rnn.dot +++ /dev/null @@ -1,56 +0,0 @@ -digraph G { - - rnn [label="1st level RNN" shape=box] - - subgraph cluster0 { - label = "time step 0" - - sent0 [label="sentence"] - sent1 [label="sentence"] - - rnn1 [label="2nd level RNN" shape=box] - - sent0 -> rnn1 - sent1 -> rnn1 - } - - subgraph cluster1 { - label = "time step 1" - - sent2 [label="sentence"] - sent3 [label="sentence"] - - rnn2 [label="2nd level RNN" shape=box] - - sent2 -> rnn2 - sent3 -> rnn2 - } - - subgraph cluster2 { - label = "time step 2" - - sent4 [label="sentence"] - sent5 [label="sentence"] - - rnn3 [label="2nd level RNN" shape=box] - - sent4 -> rnn3 - sent5 -> rnn3 - } - - - para0 [label="paragraph info 0"] - para1 [label="paragraph info 1"] - para2 [label="paragraph info 2"] - - rnn1 -> para0 - rnn2 -> para1 - rnn3 -> para2 - - para0 -> rnn - para1 -> rnn - para2 -> rnn - - chapter [label="chapter info"] - rnn -> chapter -} diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png deleted file mode 100644 index 0537a75beb175c0c284717421f7aa908da2a5038..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/2_level_rnn.png and /dev/null differ diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png deleted file mode 100644 index 481021ef306e2596818aab7fe17a570754f63635..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/3.png and /dev/null differ diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png deleted file mode 100644 index 4279f41e06de459f18b9a622539511d555e9a0af..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/4.png and /dev/null differ diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg deleted file mode 100644 index 8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg and /dev/null differ diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png deleted file mode 100644 index 75369f5378309e0f304b83f6bb69bdb195eac079..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/LoDTensor.png and /dev/null differ diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif deleted file mode 100644 index 4a0da7bf6df9326a2aab1638b77c5455c18b8c4e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/asgd.gif and /dev/null differ diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot deleted file mode 100644 index 4bc47713cba2cb23f1b34fffe6426ef10ac3a9df..0000000000000000000000000000000000000000 --- a/doc/fluid/images/batch_norm_fork.dot +++ /dev/null @@ -1,25 +0,0 @@ -digraph ImageBatchNormForkGragh { - subgraph cluster_before { - Prev [label="...", shape=plaintext]; - Rnn [label="rnn_op", shape=box]; - BatchNorm [label="batch_norm_op", shape=box]; - Fc [label="fc_op", shape=box]; - After [label="...", shape=plaintext]; - Prev -> Rnn -> BatchNorm -> Fc -> After; - label="original"; - } - - subgraph cluster_after { - Prev2 [label="...", shape=plaintext]; - Rnn2 [label="rnn_op", shape=box]; - BatchNorm2_1 [label="train_batch_norm_op", shape=box]; - BatchNorm2_2 [label="infer_batch_norm_op", shape=box]; - Fc2_1 [label="fc_op", shape=box]; - Fc2_2 [label="fc_op", shape=box]; - After2_1 [label="...", shape=plaintext]; - After2_2 [label="...", shape=plaintext]; - Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1; - Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2 - label="forked"; - } -} diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png deleted file mode 100644 index aded62bce5bc268b7a3ef4dc96c89fe21d6ea955..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/batch_norm_fork.png and /dev/null differ diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png deleted file mode 100644 index a99ce81ff3bf42880ebbd6a1297de3bf038e09b2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/batch_norm_op_kernel.png and /dev/null differ diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png deleted file mode 100644 index 7f7e35f34223162d0f7f0ed97375909c43b830ae..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/beam_search.png and /dev/null differ diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png deleted file mode 100644 index 232762b82a9ae3e979a1f38a7beb715c87438f40..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/ci_build_whl.png and /dev/null differ diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png deleted file mode 100644 index 0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/compile_run_time.png and /dev/null differ diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle deleted file mode 100644 index 8cc678fea3c820103e7ce81f7a5d625d6c1d92de..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/compiler.graffle and /dev/null differ diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png deleted file mode 100644 index 65d34f841afce9756def07dd8ecb9ca44e658bfe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/compiler.png and /dev/null differ diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png deleted file mode 100644 index 3579998e58d07abc50bd3332128d4733a391cb3b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/control_flow_graph.png and /dev/null differ diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png deleted file mode 100644 index c10f7f69f4007952e5b0394edaa04efa1cfbb658..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/dataflow_equations.png and /dev/null differ diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png deleted file mode 100644 index 15e8e290a111ff43900934341365cb4360d87d28..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/dcgan.png and /dev/null differ diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png deleted file mode 100644 index 026becc4d94e01e407dacb2a5314a0e5723334ff..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/deep_learning.png and /dev/null differ diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle deleted file mode 100644 index 941399c6ced8d5f65b6c595522b770c88259df4b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/dist-graph.graffle and /dev/null differ diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png deleted file mode 100644 index 3546b09f1c2ee3e4f60f519d5e47f823f08051a7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/dist-graph.png and /dev/null differ diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle deleted file mode 100644 index d1b60141342232e06227c2d430ebc60ec349a907..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/distributed_architecture.graffle and /dev/null differ diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png deleted file mode 100644 index 29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/distributed_architecture.png and /dev/null differ diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png deleted file mode 100644 index 1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/ds2_network.png and /dev/null differ diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png deleted file mode 100644 index b29c0d779e3d46b779b5baeabe3176adaeb00a6d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/executor.png and /dev/null differ diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png deleted file mode 100644 index d312371a04c26aa6cd196e0bd1f51becb425180b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/feed_forward.png and /dev/null differ diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png deleted file mode 100644 index 677e99bfd9f8e72ed9fe4b27127af2ced202f447..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/feed_forward_regularized.png and /dev/null differ diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle deleted file mode 100644 index c933df2cb855462c52b2d25f7f9a99b95652961d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/fluid-compiler.graffle and /dev/null differ diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png deleted file mode 100644 index 1b0ffed2039c91a3a00bbb719da08c91c3acf7bb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/fluid-compiler.png and /dev/null differ diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png deleted file mode 100644 index aa99472c0f914cde128fd7b3bd8dc29ac24f94b6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/fluid_examples.png and /dev/null differ diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png deleted file mode 100644 index 554782ba54e43efc3d6babbb94e3cac3530ac649..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/fluid_module_1.png and /dev/null differ diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png deleted file mode 100644 index 4219efccbb1e87839adf6b5720fe46808b7d2fcf..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/fluid_module_2.png and /dev/null differ diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash deleted file mode 100755 index 35e6997abd17588e17a82d448918fc1b3bd7220e..0000000000000000000000000000000000000000 --- a/doc/fluid/images/graph_construction_example.bash +++ /dev/null @@ -1,11 +0,0 @@ -cat ./graph_construction_example.dot | \ - sed 's/color=red/color=red, style=invis/g' | \ - sed 's/color=green/color=green, style=invis/g' | \ - dot -Tpng > graph_construction_example_forward_only.png - -cat ./graph_construction_example.dot | \ - sed 's/color=green/color=green, style=invis/g' | \ - dot -Tpng > graph_construction_example_forward_backward.png - -cat ./graph_construction_example.dot | \ - dot -Tpng > graph_construction_example_all.png diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot deleted file mode 100644 index e115f9844bae6ad24f638c8ed4749cea8aff06a9..0000000000000000000000000000000000000000 --- a/doc/fluid/images/graph_construction_example.dot +++ /dev/null @@ -1,68 +0,0 @@ -digraph ImageClassificationGraph { - ///////// The forward part ///////// - FeedX [label="Feed", color=blue, shape=box]; - FeedY [label="Feed", color=blue, shape=box]; - InitW [label="Init", color=blue, shape=diamond]; - Initb [label="Init", color=blue, shape=diamond]; - FC [label="FC", color=blue, shape=box]; - MSE [label="MSE", color=blue, shape=box]; - - x [label="x", color=blue, shape=oval]; - l [label="l", color=blue, shape=oval]; - y [label="y", color=blue, shape=oval]; - W [label="W", color=blue, shape=doublecircle]; - b [label="b", color=blue, shape=doublecircle]; - cost [label="cost", color=blue, shape=oval]; - - FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; - FeedY -> l [color=blue]; - InitW -> W [color=blue]; - Initb -> b [color=blue]; - W -> FC [color=blue]; - b -> FC [color=blue]; - l -> MSE [color=blue]; - - ////////// The backward part ///////// - MSE_Grad [label="MSE_grad", color=red, shape=box]; - FC_Grad [label="FC_grad", color=red, shape=box]; - - d_cost [label="d cost", color=red, shape=oval]; - d_y [label="d y", color=red, shape=oval]; - d_b [label="d b", color=red, shape=oval]; - d_W [label="d W", color=red, shape=oval]; - - cost -> MSE_Grad [color=red]; - d_cost -> MSE_Grad [color=red]; - l -> MSE_Grad [color=red]; - y -> MSE_Grad -> d_y [color=red]; - - x -> FC_Grad [color=red]; - y -> FC_Grad [color=red]; - d_y -> FC_Grad [color=red]; - W -> FC_Grad -> d_W [color=red]; - b -> FC_Grad -> d_b [color=red]; - - ////////// The optimizaiton part ////////// - - OPT_W [label="SGD", color=green, shape=box]; - OPT_b [label="SGD", color=green, shape=box]; - - W -> OPT_W [color=green]; - b -> OPT_b [color=green]; - d_W -> OPT_W -> W [color=green]; - d_b -> OPT_b -> b [color=green]; - - ////////// Groupings ////////// - - subgraph clusterMSE { - style=invis; - MSE; - MSE_Grad; - } - - subgraph clusterFC { - style=invis; - FC; - FC_Grad; - } -} diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png deleted file mode 100644 index 261611a5721f9aa97874f7e6d897fe48cf667db2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/graph_construction_example_all.png and /dev/null differ diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png deleted file mode 100644 index 4c69687f4a6a181138f3df72ce5e8aa48487b5be..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/graph_construction_example_forward_backward.png and /dev/null differ diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png deleted file mode 100644 index e668c16e0cac73acb4e5dc2b1827557ae77126b4..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/graph_construction_example_forward_only.png and /dev/null differ diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png deleted file mode 100644 index e1b9c7a44f94dc027598a98da93ddb8133190972..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/l1_regularization.png and /dev/null differ diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png deleted file mode 100644 index d5c2fcbc2ccae75ad083162e5a2dceb0210be298..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/l2_regularization.png and /dev/null differ diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png deleted file mode 100644 index e46db4c9c6f5b65ff274b498b716b11de343a8b0..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/layer.png and /dev/null differ diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle deleted file mode 100644 index 19e509bd9af3c1e9a3f5e0f16ddd281457a339c5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/local-graph.graffle and /dev/null differ diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png deleted file mode 100644 index ada51200f793a9bb18911e7d63cfdb3244b967d7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/local-graph.png and /dev/null differ diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle deleted file mode 100644 index 49fcc663ebe3824aa234e3a67aadf285cb417877..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/local_architecture.graffle and /dev/null differ diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png deleted file mode 100644 index 14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/local_architecture.png and /dev/null differ diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png deleted file mode 100644 index 72dfe3547f731d0d090338afb206b0549dff472e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/lookup_table.png and /dev/null differ diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png deleted file mode 100644 index cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/lookup_table_training.png and /dev/null differ diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png deleted file mode 100644 index 14212ec8d36c803de96bde8a9a4b5591bd20434e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/loss_equation.png and /dev/null differ diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle deleted file mode 100644 index e71173715fff92a0a933d0c7d83599ba948552c6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multi-threads.graffle and /dev/null differ diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png deleted file mode 100644 index e40a869987dbbf5019d4cb03c1dab55b74d6c9f9..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multi-threads@3x.png and /dev/null differ diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle deleted file mode 100644 index cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multigpu_allreduce.graffle and /dev/null differ diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png deleted file mode 100644 index 87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multigpu_allreduce.png and /dev/null differ diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle deleted file mode 100644 index 6c35ab1b21fb76ceae82d3693ed0d085b5bc0855..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multigpu_before_convert.graffle and /dev/null differ diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png deleted file mode 100644 index 9c8f7711165d80a2fa3911280fdee91855a401b1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multigpu_before_convert.png and /dev/null differ diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png deleted file mode 100644 index b22126b31db4982c13fc3a0827805e6aaf955046..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/multiple_reader.png and /dev/null differ diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot deleted file mode 100644 index c8ad839cb88788e9b5906402257cc7bbc3ddcb54..0000000000000000000000000000000000000000 --- a/doc/fluid/images/op.dot +++ /dev/null @@ -1,4 +0,0 @@ -digraph sample { - graph [rankdir=TD]; node [shape=record]; - op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; -} \ No newline at end of file diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot deleted file mode 100644 index 8f24e9ea83acf879c7008f2d97113c0a4cc111c3..0000000000000000000000000000000000000000 --- a/doc/fluid/images/op_op_with_kern_class_diagram.dot +++ /dev/null @@ -1,38 +0,0 @@ -digraph sample { - graph [rankdir=TD]; node [shape=record]; - op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; - op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] - op_kernel [label="{OpKernel | Compute()=0}"] - op_kernel_key [label="{OpKernelKey| Place place\n...}"] - - op -> op_with_kern [dir=back, arrowtail=onormal] - op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] - - { - rank=same; - op_with_kern - op_kernel - } - - op_kernel -> op_kernel_key [style=invis] - - { - rank=same; - op_kernel - op_kernel_key - } - - op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] - - mul_op [label="MulOp"] - op_with_kern -> mul_op [dir=back, arrowtail=onormal] - mul_kernel [label="template <typename Place>\lclass MulOpKernel\l"] - op_kernel -> mul_kernel [dir=back, arrowtail=onormal] - mul_op -> mul_kernel [arrowhead=vee, label="register many"] - - { - rank=same; - mul_op; - mul_kernel; - } -} \ No newline at end of file diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot deleted file mode 100644 index 4f5af4f7b5f5a69693a058c99eb658900136077a..0000000000000000000000000000000000000000 --- a/doc/fluid/images/op_with_kernel.dot +++ /dev/null @@ -1,26 +0,0 @@ -digraph sample { - graph [rankdir=TD]; node [shape=record]; - op [label="{Operator}"]; - op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] - op_kernel [label="{OpKernel | Compute()=0}"] - op_kernel_key [label="{OpKernelKey| Place place\n...}"] - - op -> op_with_kern [dir=back, arrowtail=onormal] - op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] - - { - rank=same; - op_with_kern - op_kernel - } - - op_kernel -> op_kernel_key [style=invis] - - { - rank=same; - op_kernel - op_kernel_key - } - - op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] -} \ No newline at end of file diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png deleted file mode 100644 index 3975b06f615b7a88dfc11e71b6451fdf4ce42d60..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/operator1.png and /dev/null differ diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png deleted file mode 100644 index b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/operator2.png and /dev/null differ diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle deleted file mode 100644 index a6348cc3dbcaca923c6e794681b2edb85cb9f8f6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/paddle-compile.graffle and /dev/null differ diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png deleted file mode 100644 index e0f13d551ac41afaec627a57dea79356464bf0bf..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/paddle-compile.png and /dev/null differ diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png deleted file mode 100644 index 14e77511d639af155e5a3725cde05323e0cc94f2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/place.png and /dev/null differ diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png deleted file mode 100644 index 8e9edbf377672d0ef40f2fc7bd39e746923550cb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/pprof_1.png and /dev/null differ diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png deleted file mode 100644 index 172ba20399ba974d27f4c072425277b69b02520b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/pprof_2.png and /dev/null differ diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png deleted file mode 100644 index e8e459e1b3d5c8706b3caa05dc371db8d46df4a5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/print_fluid_program.png and /dev/null differ diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png deleted file mode 100644 index d57b71ca88aaba5d05584a6219d84214e285a1e1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/profiler.png and /dev/null differ diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png deleted file mode 100644 index 0656336914ece957f2e5bb4d70ad337a63e31d88..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/program_desc1.png and /dev/null differ diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png deleted file mode 100644 index db5bfa1231345add8661b4f8ef0fc9d861f40d24..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/program_desc2.png and /dev/null differ diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png deleted file mode 100644 index 0725f92d2b169c2b59ec7c68b402859c2a2dd1d8..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/raw_input.png and /dev/null differ diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png deleted file mode 100644 index fd59168ce16c9e2a0ef45303c28c997cfd7740be..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/readers.png and /dev/null differ diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle deleted file mode 100644 index 41b2067311694b56d211a4f32d1b76884eeffd2d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/remote_executor.graffle and /dev/null differ diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png deleted file mode 100644 index 744e2fb2e0f1bbe058e991ba7b2a09000965ee79..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/remote_executor.png and /dev/null differ diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot deleted file mode 100644 index c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5..0000000000000000000000000000000000000000 --- a/doc/fluid/images/rnn.dot +++ /dev/null @@ -1,87 +0,0 @@ -digraph G { - label = "simple RNN implementation" - - ranksep=2; - - //graph [nodesep=1, ranksep=1]; - - node[nodesep=1] - - subgraph cluster0 { - label = "global scope" - rankdir = TB - W - boot_memory - input - output - } - - subgraph cluster1 { - label = "step-scope 0" - rankdir = TB - memory0[label="memory"] - prememory0[label="pre-memory"] - step_input0[label="step input"] - step_output0[label="step output"] - } - - subgraph cluster2 { - label = "step-scope 1" - rankdir = TB - memory1[label="memory"] - prememory1[label="pre-memory"] - step_input1[label="step input"] - step_output1[label="step output"] - } - - subgraph cluster3 { - label = "step-scope 2" - rankdir = TB - memory2[label="memory"] - prememory2[label="pre-memory"] - step_input2[label="step input"] - step_output2[label="step output"] - } - - stepnet [shape=box] - stepnet0 [shape=box, style=dashed] - stepnet1 [shape=box, style=dashed] - stepnet2 [shape=box, style=dashed] - - - edge[color=blue] - boot_memory -> prememory0 [label="init" color="blue"] - memory0 -> prememory1 [label="copy/reference" color="blue"] - memory1 -> prememory2 [label="copy/reference" color="blue"] - - edge[color=black] - W -> stepnet0[constraint=false, style=dashed] - W -> stepnet1[constraint=false, style=dashed] - W -> stepnet2[constraint=false, style=dashed] - - memory0 -> stepnet0[style=dashed] - prememory0 -> stepnet0 -> step_output0[style=dashed] - - memory1 -> stepnet1[style=dashed] - prememory1 -> stepnet1 -> step_output1[style=dashed] - - memory2 -> stepnet2[style=dashed] - prememory2 -> stepnet2 -> step_output2[style=dashed] - - input -> step_input0 - input -> step_input1 - input -> step_input2 - - step_input0 -> stepnet0 [style=dashed] - step_input1 -> stepnet1[style=dashed] - step_input2 -> stepnet2[style=dashed] - - step_output0 -> output - step_output1 -> output - step_output2 -> output - - stepnet0 -> stepnet[style=dashed] - stepnet1 -> stepnet[style=dashed] - stepnet2 -> stepnet[style=dashed] - -} diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg deleted file mode 100644 index 9867e404cf959df0dce6ded5222b466c788fb840..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/rnn.jpg and /dev/null differ diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png deleted file mode 100644 index e139e373fe8396782044cfd936fdde624f8c66fe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/rnn.png and /dev/null differ diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot deleted file mode 100644 index 1d85ae2617a915ad0ad8288d848b607cc37ad297..0000000000000000000000000000000000000000 --- a/doc/fluid/images/rnn_2level_data.dot +++ /dev/null @@ -1,75 +0,0 @@ -digraph G { - chapter [label="chapter"] - - subgraph cluster0 { - label = "paragraph 0" - - top_rnn0[label="top rnn step 0" shape=box] - - p0 [label="paragraph 0"] - p1 [label="paragraph 1"] - } - - subgraph cluster1{ - label = "paragraph 1" - - top_rnn1[label="top rnn step 1" shape=box] - - p2 [label="paragraph 0"] - p3 [label="paragraph 1"] - } - - subgraph cluster_p0 { - label = "sentence 0" - - low_rnn0 [label="low rnn step 0" shape=box] - s00 [label="sentence 0"] - s01 [label="sentence 1"] - - low_rnn0 -> s00 - low_rnn0 -> s01 - } - - subgraph cluster_p1 { - label = "sentence 1" - low_rnn1 [label="low rnn step 1" shape=box] - s10 [label="sentence 0"] - s11 [label="sentence 1"] - low_rnn1 -> s10 - low_rnn1 -> s11 - } - - subgraph cluster_p2 { - label = "sentence 1" - low_rnn2 [label="low rnn step 0" shape=box] - s20 [label="sentence 0"] - s21 [label="sentence 1"] - low_rnn2 -> s20 - low_rnn2 -> s21 - } - - subgraph cluster_p3 { - label = "sentence 1" - low_rnn3 [label="low rnn step 1" shape=box] - s30 [label="sentence 0"] - s31 [label="sentence 1"] - low_rnn3 -> s30 - low_rnn3 -> s31 - } - - - chapter -> top_rnn0 - chapter -> top_rnn1 - - top_rnn0 -> p0 - top_rnn0 -> p1 - top_rnn1 -> p2 - top_rnn1 -> p3 - - - p0 -> low_rnn0 - p1 -> low_rnn1 - p2 -> low_rnn2 - p3 -> low_rnn3 - -} diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png deleted file mode 100644 index 4be81b2430717a6a506342a09fc26899568574c6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/rnn_2level_data.png and /dev/null differ diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png deleted file mode 100644 index 59b0de6fb36f9f6b469227c05760a7612bb30b4d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/scope_variable_tensor.png and /dev/null differ diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png deleted file mode 100644 index 4083aebfdd45af5fbac25fa2c4176bc08c3cb44a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/single-thread@3x.png and /dev/null differ diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png deleted file mode 100644 index ff601128368ee179e3fd33e5e295a9ddd3dcbaeb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/sorted_input.png and /dev/null differ diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle deleted file mode 100644 index 08d689a58f83698d8c1158ee3990ed8abf3a7a9a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/sparse_update.graffle and /dev/null differ diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png deleted file mode 100644 index 8c872e6ac479f7d1b818a4a207956c43155d0ad7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/sparse_update.png and /dev/null differ diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot deleted file mode 100644 index 62c69b8fc8010a26a54a6ee8ef1488aad94d747a..0000000000000000000000000000000000000000 --- a/doc/fluid/images/test.dot +++ /dev/null @@ -1,35 +0,0 @@ - -digraph Test { - z -> generator -> G_img; - G_img -> discriminator -> D_f -> d_loss_f; - label0 -> d_loss_f -> d_loss; - - img -> discriminator -> D_t -> d_loss_t; - label1 -> d_loss_t -> d_loss; - - d_loss -> d_loss_t[color=red, style=dashed]; - d_loss -> d_loss_f[color=red, style=dashed]; - d_loss_t -> D_t[color=red, style=dashed]; - d_loss_f -> D_f[color=red, style=dashed]; - D_t -> discriminator[color=red, style=dashed]; - D_f -> discriminator[color=red, style=dashed]; - - D_f -> g_loss; - label2 -> g_loss; - - g_loss -> D_f[color=green, style=dashed]; - D_f -> discriminator[color=green, style=dashed]; - discriminator -> G_img[color=green, style=dashed]; - G_img -> generator[color=green, style=dashed]; - - discriminator [color=red, shape=box]; - generator [color=green, shape=box]; - z [shape=diamond]; - img [shape=diamond]; - label0 [shape=diamond]; - label1 [shape=diamond]; - label2 [shape=diamond]; - - d_loss [color=red]; - g_loss [color=green]; -} diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png deleted file mode 100644 index 4e121a40b9f7b2232d7cdda315bad15926446f55..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/test.dot.png and /dev/null differ diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif deleted file mode 100644 index dd24d33e124396be3fc410c9b12f33148f64efe2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/theta_star.gif and /dev/null differ diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg deleted file mode 100644 index 38ec3f80c982857531f30a8bb0fa26ea5bf05385..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/timeline.jpeg and /dev/null differ diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg deleted file mode 100644 index 3a49fc4f8a401a9463b0157e2f38c164ca02dcc5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/tracing.jpeg and /dev/null differ diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png deleted file mode 100644 index 422973c0dc7aa2b544d2fc86a97ace706388cb9e..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/transpiler.png and /dev/null differ diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png deleted file mode 100644 index ffc94e3d8945ec6291460afd90e8fcc600828390..0000000000000000000000000000000000000000 Binary files a/doc/fluid/images/user_interface.png and /dev/null differ diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst deleted file mode 100644 index 6b1ef3ceed4f7ed5073d42c13ce103e2ab467e58..0000000000000000000000000000000000000000 --- a/doc/fluid/index_cn.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. PaddlePaddle Fluid documentation master file, created by - sphinx-quickstart on Thu Jun 7 17:04:53 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -############## -欢迎使用 Fluid -############## - -.. toctree:: - :maxdepth: 1 - - new_docs/beginners_guide/index.rst - new_docs/user_guides/index.rst - new_docs/advanced_usage/index.rst - new_docs/faq/index_cn.rst diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst deleted file mode 100644 index 2bc76b58982cf50e637d15cca0c5d78166aa73a9..0000000000000000000000000000000000000000 --- a/doc/fluid/index_en.rst +++ /dev/null @@ -1,12 +0,0 @@ - PaddlePaddle Fluid -========================== - -.. toctree:: - :maxdepth: 1 - - getstarted/index_en.rst - build_and_install/index_en.rst - design/index_en.rst - howto/index_en.rst - dev/index_en.rst - faq/index_en.rst diff --git a/doc/fluid/new_docs/advanced_usage/benchmark.rst b/doc/fluid/new_docs/advanced_usage/benchmark.rst deleted file mode 100644 index 7854263bf8f64c840492550fb22152582c7d2361..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/benchmark.rst +++ /dev/null @@ -1,120 +0,0 @@ -################# -如何进行基准测试 -################# - -本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面,下文包含搭建测试环境,选择基准测试模型,验证测试结果等几方面内容。 - -验证深度学习框架,可分为训练和测试两个阶段, 验证指标略有不同,本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度,训练集是完备的,因此关注大batch\_size下的训练速度,关注吞吐量,例如图像模型常用的batch\_size=128, 多卡情况下会加大;预测阶段关注的是在测试集上的精度,线上服务测试数据不能提前收集,因此关注小batch\_size下的预测速度,关注延迟,例如预测服务常用的batch\_size=1, 4等。 - -`Fluid `__ 是PaddlePaddle从0.11.0版本开始引入的设计,本文的基准测试在该版本上完成。 - - -环境搭建 -"""""""""""" - -基准测试中模型精度和硬件、框架无关,由模型结构和数据共同决定;性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异,控制硬件环境,系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行. - - -不同架构的GPU卡性能差异巨大,在验证模型在GPU上训练性能时,可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号,如果测试多卡训练性能,需确认硬件连接是 `nvlink `__ 或 `PCIe `__ 。 同样地,CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数,确认当前正在使用的CPU型号。 - -下载GPU对应的Cuda Tool Kit和 Cudnn,或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker `__, 镜像内包含了Cuda和Cudnn,本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库,影响在此基础上编译出的Fluid二进制运行性能。 - -准备好Cuda环境后,从github上的下载Paddle并源码编译,会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch `__\ 。另外,cudnn对卷积类任务影响巨大,在基准测试中需要小版本一致,例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。 - - -选择基准模型 -"""""""""""" - -对框架做基准测试,需要覆盖不同训练任务和不同大小的模型,本文中选取了图像和NLP的最为常用的5个模型。 - -============ ============ ================= ============ -任务种类 模型名称 网络结构 数据集 -============ ============ ================= ============ -图像分类 mnist Lenet mnist -图像分类 VGG VGG-16 Flowers102 -图像分类 Resnet Resnet-50 Flowers102 -文本分类 Stacked-LSTM Stacked-LSTM IMDB -机器翻译 seq-seq Stacked-LSTM wmt14 -============ ============ ================= ============ - -其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。 -`benchmark `__ -基准模型测试脚本中,均跳过了前几个batch的训练过程,原因是加载数据和分配显存受系统当前运行情况影响,会导致统计性能不准确。运行完若干个轮次后,统计对应指标。 - - -基准模型的数据的选择方面,数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 `__ ,图像大小预处理为和Imagenet相同大小,因此性能可直接对比 -NLP模型的公开且影响力大数据集较少,seq2seq模型选择了wmt14数据,stacked-lstm模型中选择了 `imdb `__ 数据。 - - -注意,图像模型每条样本大小相同,图像经过变换后大小一致,因此经过的计算路径基本相同,计算速度和显存占用波动较小,可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定,计算路径和显存占用也不相同,因此只能完整运行若干个轮次后,统计速度和显存消耗。 -显存分配是特别耗时的操作,因此Fluid默认会占用所有可用显存空间形成显存池,用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗,可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`,观察最大显存开销。 - - -测试过程 -"""""""""""" - -- CPU 单机单线程测试 - -测试CPU上单线程的性能,先设置CUDA的环境变量为空,``CUDA_VISIBLE_DEVICES=``,并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1``, ``MKL_NUM_THREADS=1;``。 -然后代码中设置为使用CPUPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=False即可。 - -.. code-block:: python - - >>> import paddle.fluid as fluid - >>> place = fluid.CPUPlace() - -.. code:: bash - - docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash - - -- GPU 单机单卡测试 - -本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04` - -.. code:: bash - - nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash -在单卡上测试,设置CUDA的环境变量使用一块GPU,``CUDA_VISIBLE_DEVICES=0`` -然后代码中设置为使用CUDAPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=True即可。 - -.. code-block:: python - - >>> import paddle.fluid as fluid - >>> place = fluid.CUDAPlace(0) // 0 指第0块GPU - - -测试结果 -"""""""""""" - -本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。 -硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。 -系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境,系统版本为nvidia-docker17.05.0-ce。 -测试的Fluid版本为\ `v.0.12.0 `__ 。 -TensorFlow版本为\ `v.1.4.0-rc1 `__ 。 -使用的脚本和配置见\ `benchmark `__ 。 -图表中统计单位为samples/秒。 - -- CPU 单机单线程测试结果 - - ================ ==================== =================== - Speed Fluid CPU TensorFlow CPU - ================ ==================== =================== - mnist 1298.75 samples/s 637.57 samples/s - VGG-16 0.4147 images/s 0.1229 images/s - Resnet-50 1.6935 images/s 0.3657 images/s - Stacked-LSTM 472.3225 words/s 48.2293words/s - Seq2Seq 217.1655 words/s 28.6164 words/s - ================ ==================== =================== - -- GPU 单机单卡测试结果 - - =============== ===================== ================= - Speed Fluid GPU TensorFlow GPU - =============== ===================== ================= - mnist 19710.90 samples/s 15576.3 samples/s - VGG-16 59.83327 images/s 40.9967 images/s - Resnet-50 105.84412 97.8923 images/s - Stacked-LSTM 1319.99315 1608.2526 words/s - Seq2Seq 7147.89081 6845.1161 words/s - =============== ===================== ================= diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md deleted file mode 100644 index 08ea379f81d16407ed5f82770b55a34bcf138da8..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md +++ /dev/null @@ -1,56 +0,0 @@ -# Anakin ARM 性能测试 - -## 测试环境和参数: -+ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd -+ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armveabi-v7a with neon -mfloat-abi=softfp -+ 测试平台 - - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz - - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz - - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz -+ 多线程:openmp -+ 时间:warmup10次,运行10次取均值 -+ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本 -+ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本 - -在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析 - -## BenchMark model - -> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model -> 对这些model,本文在ARM上进行多线程的单batch size测试。 - -- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* -- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* -- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载* - -### mobilenetv1 - - |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| - |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| - |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan| - |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan| - |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan| - -### mobilenetv2 - - |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| - |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| - |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan| - |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan| - |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan| - -### mobilenet-ssd - - |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| - |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| - |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan| - |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan| - |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan - -## How to run those Benchmark models? - -1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换 -2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机 -3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令 -4. 最后,终端显示器上将会打印该模型的运行时间 -5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md deleted file mode 100644 index e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md +++ /dev/null @@ -1,28 +0,0 @@ -# Example -Anakin目前只支持NCHW的格式 -示例文件在test/framework/net下 - -## 在NV的GPU上运行CNN模型 -示例文件为打开example_nv_cnn_net.cpp,整体流程如下: -- 将模型的的path设置为anakin模型的路径,初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到 -- 根据模型设置网络图的输入尺寸,进行图优化 -- 根据优化后的网络图初始化网络执行器 -- 取出网络的输入tensor,将数据拷贝到输入tensor -- 运行推导 -- 取出网络的输出tensor - -以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关 - -## 在X86上运行RNN模型 -示例文件为example_x86_rnn_net.cpp -整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: -- 使用X86标识初始化图对象和网络执行器对象 -- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词,其中第0到第4个词是第一句话,第5到第11个词是第二句话 - -以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关 - -## 在NV的GPU上使用Anakin的线程池运行CNN模型 -示例文件为example_nv_cnn_net_multi_thread.cpp ,示例使用worker的同步预测接口 -整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: -- 用模型地址和线程池大小初始化worker对象 -- 将输入tensor注入任务队列,获得输出tensor diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md deleted file mode 100644 index 667f9396f1169a0d891b9e6b0e912aa5527ab0b8..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md +++ /dev/null @@ -1,170 +0,0 @@ -# Anakin GPU Benchmark - -## Machine: - -> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz` -> GPU: `Tesla P4` -> cuDNN: `v7` - - -## Counterpart of anakin : - -The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support. - -## Benchmark Model - -The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`. - You can use pretrained caffe model or the model trained by youself. - -> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md) - - -- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)* -- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)* -- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* -- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)* -- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* -- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)* -- [RNN](#7) *not support yet* - -We tested them on single-GPU with single-thread. - -### VGG16 - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 8.8690 | 8.2815 | -| 2 | 15.5344 | 13.9116 | -| 4 | 26.6000 | 21.8747 | -| 8 | 49.8279 | 40.4076 | -| 32 | 188.6270 | 163.7660 | - -- GPU Memory Used (`MB`) - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 963 | 997 | -| 2 | 965 | 1039 | -| 4 | 991 | 1115 | -| 8 | 1067 | 1269 | -| 32 | 1715 | 2193 | - - -### Yolo - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 16.4596| 15.2124 | -| 2 | 26.6347| 25.0442 | -| 4 | 43.3695| 43.5017 | -| 8 | 80.9139 | 80.9880 | -| 32 | 293.8080| 310.8810 | - -- GPU Memory Used (`MB`) - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 1569 | 1775 | -| 2 | 1649 | 1815 | -| 4 | 1709 | 1887 | -| 8 | 1731 | 2031 | -| 32 | 2253 | 2907 | - -### Resnet50 - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 4.2459 | 4.1061 | -| 2 | 6.2627 | 6.5159 | -| 4 | 10.1277 | 11.3327 | -| 8 | 17.8209 | 20.6680 | -| 32 | 65.8582 | 77.8858 | - -- GPU Memory Used (`MB`) - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 531 | 503 | -| 2 | 543 | 517 | -| 4 | 583 | 541 | -| 8 | 611 | 589 | -| 32 | 809 | 879 | - -### Resnet101 - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 7.5562 | 7.0837 | -| 2 | 11.6023 | 11.4079 | -| 4 | 18.3650 | 20.0493 | -| 8 | 32.7632 | 36.0648 | -| 32 | 123.2550 | 135.4880 | - -- GPU Memory Used (`MB)` - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 701 | 683 | -| 2 | 713 | 697 | -| 4 | 793 | 721 | -| 8 | 819 | 769 | -| 32 | 1043 | 1059 | - -### MobileNet V1 - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 45.5156 | 1.3947 | -| 2 | 46.5585 | 2.5483 | -| 4 | 48.4242 | 4.3404 | -| 8 | 52.7957 | 8.1513 | -| 32 | 83.2519 | 31.3178 | - -- GPU Memory Used (`MB`) - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 329 | 283 | -| 2 | 345 | 289 | -| 4 | 371 | 299 | -| 8 | 393 | 319 | -| 32 | 531 | 433 | - -### MobileNet V2 - -- Latency (`ms`) of different batch - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 65.6861 | 2.9842 | -| 2 | 66.6814 | 4.7472 | -| 4 | 69.7114 | 7.4163 | -| 8 | 76.1092 | 12.8779 | -| 32 | 124.9810 | 47.2142 | - -- GPU Memory Used (`MB`) - -| BatchSize | TensorRT | Anakin | -| --- | --- | --- | -| 1 | 341 | 293 | -| 2 | 353 | 301 | -| 4 | 385 | 319 | -| 8 | 421 | 351 | -| 32 | 637 | 551 | - -## How to run those Benchmark models? - -> 1. At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md). -> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file. -> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen. -> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file. diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md deleted file mode 100644 index 5efbc89abd469871b318c306e8cb03dd95f0c85b..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md +++ /dev/null @@ -1,639 +0,0 @@ -# Anakin 使用教程 ## - -本教程将会简略的介绍Anakin的工作原理,一些基本的Anakin API,以及如何调用这些API。 - -## 内容 ### - -- [Anakin的工作原理](#principle) -- [Anakin APIs](#api) -- [示例代码](#example) - -## Anakin的工作原理 ### - -![Anakin_principle](../pics/anakin_fm_ch.png) - -用Anakin来进行前向计算主要分为三个步骤: - -- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型 - 在使用Anakin之前,用户必须将所有其他模型转换成Anakin模型,我们提供了转换脚本,用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。 -- 生成Anakin计算图 - 加载Anakin模型生成原始计算图,然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。 -- 执行计算图 - Anakin会选择不同硬件平台执行计算图。 - - -## Anakin APIs ### -### Tensor #### - -`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性: - -- Buffer - 数据存储区 -- Shape - 数据的维度信息 -- Event - 用于异步计算的同步 - - `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示。 - - -Dimentions | Math entity | - :----: | :----: -1 | vector -2 | matrix -3 | 3-tensor -n | n-tensor - -#### 声明tensor对象 - -`Tensor`接受三个模板参数: - - -```c++ - template - class Tensor .../* Inherit other class */{ - //some implements - ... - }; -``` - -TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下: - -1. TargetType - - Anakin TargetType | platform - :----: | :----:| - NV | NVIDIA GPU - ARM | ARM - AMD | AMD GPU - X86 | X86 - NVHX86 | NVIDIA GPU with Pinned Memory - -2. DataType - -Anakin DataType | C++ | Description -:---: | :---: | :---: | -AK_HALF | short | fp16 -AK_FLOAT | float | fp32 -AK_DOUBLE | double | fp64 -AK_INT8 | char | int8 -AK_INT16 | short | int16 -AK_INT32 | int | int32 -AK_INT64 | long | int64 -AK_UINT8 | unsigned char | uint8 -AK_UINT16 | unsigned short | uint8 -AK_UINT32 | unsigned int | uint32 -AK_STRING | std::string | / -AK_BOOL | bool | / -AK_SHAPE | / | Anakin Shape -AK_TENSOR | / | Anakin Tensor - - -3. LayOutType - -Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support -:---: | :---: | :---: | :---: | -W | 1-D | YES | NO -HW | 2-D | YES | NO -WH | 2-D | YES | NO -NW | 2-D | YES | YES -NHW | 3-D | YES |YES -NCHW ( default ) | 4-D | YES | YES -NHWC | 4-D | YES | NO -NCHW_C4 | 5-D | YES | YES - - -理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。 - - -例子 - -> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。 - -> 要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h* - -> 1. 使用shape对象初始化tensor -``` c++ - //create a null tensor. A null tensor holds for nothing. - //tensor's buffer is resident at CPU and its datatype is AK_FLOAT. - //tensor's Layout is NCHW(default) - Tensor mytensor; - - //1. using shape object to create a tensor. - Shape shape1(NUM); //1-D shape. NUM is the number of dimention. - Tensor mytensor1(shape1); //1-D tensor. - - // A 4-D shape - Shape shape2(N, C, H, W); // batch x channel x height x width -``` - ->`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示` - - -```c++ - // A 4-D tensor. - Tensor mytensor2(shape2); //right - - //A 4-D tensor which is resident at GPU and its datatype is AK_INT8 - Tensor mytensor3(shape2); //right - - Tensor mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout. - Tensor mytensor5(shape2); //wrong!!!! - -``` - -> 2. 使用现有的数据和shape初始化tensor - -```c++ - - /** - * A construtor of Tensor. - * data_ptr is a pointer to any data type of data - * TargetType is type of a platform [Anakin TargetType] - * id : device id - * shape: a Anakin shape - */ - Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape); - - //using existing data feed to a tensor - Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W). - -``` - -> 3. 使用tensor初始化tensor - -```c++ - Tensor tensor(exist_tensor); -``` - - -> 提示: 你可以用` typedef Tensor Tensor4d_X86 `方便定义tensor - - -#### 填充tensor数据区 - - -填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。 - -```c++ -首先来看看tensor的四种声明方式: - -1. Tensor mytensor; -2. Tensor mytensor1(shape1); -3. Tensor mytensor(data_ptr, TargetType, device_id, shape); -4. Tensor tensor(exist_tensor); - - -相关的声明方式的数据填充方法如下: - -1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。 - - //parama shape - mytensor.re_alloc(Shape shape); - - //Get writable pointer to mytensor. - //parama index (int): where you start to write. - //Dtype is your data type such int, float or double. - Dtype *p = mytensor.mutable_data(index/*=0*/); - //write data to mytensor - for(int i = 0; i < mytensor.size(); i++){ - p[i] = 1.0f; - } - //do something ... - -2: 这种声明方式会自动分配内存 - - //Get writable pointer to mytensor. - //parama index (int): where you start to write. - //Dtype is your data type such int, float or double. - Dtype *p = mytensor1.mutable_data(index/*=0*/); - //write data to mytensor - for(int i = 0; i < mytensor.size(); i++){ - p[i] = 1.0f; - } - //do something ... - - -3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的 -tensor都在都一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而 -tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。 - - //Get writable pointer to mytensor. - //parama index (int): where you start to write. - //Dtype is your data type such int, float or double. - Dtype *p = mytensor.mutable_data(index/*=0*/); - //write data to mytensor - for(int i = 0; i < mytensor.size(); i++){ - p[i] = 1.0f; - } - //do something ... - -4:该种方式仍不需要手动分配内存 - - //Get writable pointer to mytensor. - //parama index (int): where you start to write. - //Dtype is your data type such int, float or double. - Dtype *p = mytensor.mutable_data(index/*=0*/); - //write data to mytensor - for(int i = 0; i < mytensor.size(); i++){ - p[i] = 1.0f; - } - //do something ... - - -另外,你还可以获取一个tensor的可读指针,示例如下: - //Get read-only pointer to mytensor. - //parama index (int): where you start to read. - //Dtype is your data type such int, float or double. - Dtype *p = mytensor.data(index/*=0*/); - //do something ... -``` - -如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h* - -#### 获取tensor的shape - -```c++ -//some declarations -// ... -Shape shape = mytensor.shape(); - -//Get a first dimetion size of tesor, if it has. -int d1 = shape[0]; - -//Get a second dimention size of tensor, if it has. -int d2 = shape[1]; - -... - -//Get a n-th dimention size of tensor, if it has. -int dn = shape[n-1]; - - -//Get a tensor's dimention -int dims = mytensor.dims(); - -//Get the size of tensor. -//size = d1 x d2 x ... x dn. -int size = mytensor.size(); - -//Get the size of tensor at interval [Di, Dj) -// form i-th dimention to j-th dimention, but not including the j-th dimention. -// which means di x (di+1) x ... x (dj -1) -int size = mytensor.count(start, end); -``` - -#### 设置tensor的shape - -我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义 - - -```c++ -/** - * \brief set a tensor's shape - * \param valid_shape [a Shape object] - * \param shape [a Shape object] - * \param offset [a Shape object] - * \return the status of this operation, that means whether it success * or not. - */ -SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); -``` - -这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。 - -```c++ - -// some declarations -// ... -//valid_shape, shape , offset are Shape object; -//All these Shape object's LayOutType must be equal to mytensor's. -mytensor.set_shape(valid_shape, shape, offset); - -``` - -#### 重置 tensor的shape - -```c++ -//some declarations -Shape shape, valid_shape, offset; - -//do some initializations -... -mytensor.reshape(valid_shape, shape, offset); -``` - -注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同 - - -### Graph ### - -`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。 - -#### 图的声明 - -与`Tensor`一样,graph也接受三个模板参数。 - -```c++ - -template -class Graph ... /* inherit other class*/{ - - //some implements - ... - -}; -``` - -前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。 - - -```c++ - -//Create a empty graph object. -Graph graph = Graph tmp(); - -//Create a pointer to a empty graph. -Graph *graph = new Graph(); - -//Create a pointer to a empty graph. -auto graph = new Graph(); - -``` - -#### 加载 Anakin 模型 - -```c++ -//some declarations -... -auto graph = new Graph(); -std::string model_path = "the/path/to/where/your/models/are"; -const char *model_path1 = "the/path/to/where/your/models/are"; - -//Loading Anakin model to generate a compute graph. -auto status = graph->load(model_path); - -//Or this way. -auto status = graph->load(model_path1); -//Check whether load operation success. -if(!status){ - std::cout << "error" << endl; - //do something... -} - -``` - -#### 优化计算图 - -```c++ -//some declarations -... -//Load graph. -... -//According to the ops of loaded graph, optimize compute graph. -graph->Optimize(); - -``` - -> 注意: 第一次加载原始图,必须要优化。 - -#### 保存模型 - -你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。 - - -```c++ -//some declarations -... -//Load graph. -... -// save a model -//save_model_path: the path to where your model is. -auto status = graph->save(save_model_path); - -//Checking -if(!status){ - cout << "error" << endl; - //do somethin... -} -``` - -#### 重新设置计算图里的tensor的shape - -```c++ -//some declarations -... -//Load graph. -... -vector shape{10, 256, 256, 10}; -//input_name : std::string. -//Reshape a tensor named input_name. -graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object. -``` - -#### 设置 batch size - -`Graph` 支持重新设置batch size的大小。 - -```c++ -//some declarations -... -//Load graph. -... -//input_name : std::string. -//Reset a tensor named input_name. -int new_batch_size = 4; -graph->ResetBatchSize(input_name, new_batch_size); -``` - -### Net ### - - -`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出 -#### Creating a graph executor - -`Net`接受四个模板参数。 - - -```c++ -template -class Net{ - //some implements - ... - -}; -``` -由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*. - - -1. Precision - -Precision | Op support -:---: | :---: -Precision::INT4 | NO -Precision::INT8 | NO -Precision::FP16 | NO -Precision::FP32 | YES -Precision::FP64 | NO - -现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision. - - - -2. OpRunType - -OpRunType | Sync/Aync |Description -:---: | :---: | :---: -OpRunType::SYNC | Synchronization | single-stream on GPU -OpRunType::ASYNC | Asynchronization | multi-stream on GPU - -用graph对象创建一个执行器。 -```c++ -//some declarations -... -//Create a pointer to a graph. -auto graph = new Graph(); -//do something... -... - -//create a executor -Net executor(*graph); - -``` - -#### 获取输入输出tensor - - -获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码 - -```c++ -//some declaratinos -... - -//create a executor -//TargetType is NV [NVIDIA GPU] -Net executor(*graph); - -//Get the first input tensor. -//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU. -//Note: Member function get_in returns an pointer to tensor. -Tensor* tensor_in0 = executor.get_in("input_0"); - -//If you have multiple input tensors -//You just type this code below. -Tensor* tensor_in1 = executor.get_in("input_1"); -... -auto tensor_inn = executor.get_in("input_n"); -``` - -当得到输入tensor之后,就可以填充它的数据区了。 - -```c++ -//This tensor is resident at GPU. -auto tensor_d_in = executor.get_in("input_0"); - -//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one. - -//using Tensor4d = Tensor; -Tensor4d tensor_h_in; //host tensor; -//Tensor tensor_h_in; - -//Allocate memory for host tensor. -tensor_h_in.re_alloc(tensor_d_in->valid_shape()); -//Get a writable pointer to tensor. -float *h_data = tensor_h_in.mutable_data(); - -//Feed your tensor. -/** example -for(int i = 0; i < tensor_h_in.size(); i++){ - h_data[i] = 1.0f; -} -*/ -//Copy host tensor's data to device tensor. -tensor_d_in->copy_from(tensor_h_in); - -// And then -``` - - -类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输入tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor: -```c++ -//Note: this tensor are resident at GPU. -Tensor* tensor_out_d = executor.get_out("pred_out"); - -``` - - -#### Executing graph - - -当一切准备就绪后,我们就可以执行真正的计算了! -```c++ -executor.prediction(); -``` - -## 示例代码 ## - -下面的例子展示了如何调用Anakin。 - -在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。 - -### Single-thread - -单线程例子在 *source_root/test/framework/net/net_exec_test.cpp`* - -```c++ - -std::string model_path = "your_Anakin_models/xxxxx.anakin.bin"; -// Create an empty graph object. -auto graph = new Graph(); -// Load Anakin model. -auto status = graph->load(model_path); -if(!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); -} -// Reshape -graph->Reshape("input_0", {10, 384, 960, 10}); -// You must optimize graph for the first time. -graph->Optimize(); -// Create a executer. -Net net_executer(*graph); - -//Get your input tensors through some specific string such as "input_0", "input_1", and -//so on. -//And then, feed the input tensor. -//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out. -auto d_tensor_in_p = net_executer.get_in("input_0"); -Tensor4d h_tensor_in; -auto valid_shape_in = d_tensor_in_p->valid_shape(); -for (int i=0; icopy_from(h_tensor_in); - -//Do inference. -net_executer.prediction(); - -//Get result tensor through the name of output node. -//And also, you need to see the dash board again to find out how many output nodes are and remember their name. - -//For example, you've got a output node named obj_pre_out -//Then, you can get an output tensor. -auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor. -auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor. -//...... -// do something else ... -//... -//save model. -//You might not optimize the graph when you load the saved model again. -std::string save_model_path = model_path + std::string(".saved"); -auto status = graph->save(save_model_path); -if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); -} - -``` diff --git a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md deleted file mode 100644 index 56ca582b2b47f404ede777712830731ea7f4e9b5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md +++ /dev/null @@ -1,73 +0,0 @@ -# 模型转换指南 - -Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。 - -## 简介 - -Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。 - -模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。 - -您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。 - - -## 系统要求 - -- python 2.7+ -- pyyaml -- flask -- protobuf 3.5+ - - -## 用法 - -### 1、环境 -转换器所需的依赖标注于 *系统要求* 一节。 - -### 2、配置 -您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。 - -#### config.yaml -```bash -OPTIONS: - Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID - SavePath: ./output # 转换结束后模型的保存位置 - ResultName: googlenet # 输出模型的名字 - Config: - LaunchBoard: ON # 是否生成网络结构预览页面 - Server: - ip: 0.0.0.0 - port: 8888 # 从一个可用端口访问预览页面 - OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项 - enable: OFF - path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved - LOGGER: - LogToPath: ./log/ # 生成日志的路径 - WithColor: ON - -TARGET: - CAFFE: - # 当 Framework 为 CAFFE 时需填写 - ProtoPaths: - - /path/to/caffe/src/caffe/proto/caffe.proto - PrototxtPath: /path/to/your/googlenet.prototxt - ModelPath: /path/to/your/googlenet.caffemodel - - FLUID: - # 当 Framework 为 FLUID 时需填写 - Debug: NULL - ProtoPaths: - - / - PrototxtPath: /path/to/fluid/inference_model - ModelPath: /path/to/fluid/inference_model - # ... -``` - -### 3、转换 -在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。 - - -### 4、预览 -最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。 - -> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md deleted file mode 100644 index f2783eb9f591a31443f2a692ce0eb1bcc9b1063a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md +++ /dev/null @@ -1,405 +0,0 @@ -# 如何增加新的Operator - -## 基本概念 - -简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。 - -```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。 - -```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。 - -saber的文件结构: -* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件明均不带impl。 -* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。 -* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。 -* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。 - -### 涉及到的基类及各个类之前的关系 - -简单介绍相关的基类 - -* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h - -* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。 - -* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。 - -## 添加operator - -添加一个新的op需要以下几步: - -1. 添加saber的param -2. 定义saber的Operator类 -3. 定义新的impl声明 -3. 完成新的impl实现 -4. 增加framework的实现或特化 - -接下来就针对这几步,以一个简单例子为例介绍实现。 - -例如我们要添加新的Mul op。给出计算公式如下:$$Out = alpha \dot X * Y$$ - -### 为operator增加param - -涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。 -这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。 -``` -template // 能够获得target, datatype, layout -struct MulParam{ - MulParam() - : alpha(0) - {} - MulParam(float alpha_in) - : alpha(alpha_in) - {} - MulParam(const MulParam& right) - : alpha(right.alpha) - {} - MulParam &operator=(const MulParam &right) { - alpha = right.alpha; - } - bool operator==(const MulParam &right) { - return alpha == right.alpha; - } - float alpha; -}; -``` - -### 定义Operator类 -涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。 -下面给出一个相对完整的定义结构供参考。 -``` -//不同的设备需要包含对应的operator实现.[详见](#impl) -#ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_mul.h" -#include "saber/funcs/impl/cuda/vender_mul.h" -#endif -//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare) -#ifdef USE_X86_PLACE -#include "saber/funcs/impl/impl_mul.h" -#endif -namespace anakin { -namespace saber { -template -class Mul : public BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, MulParam> { -public: - using BaseFunc< - Tensor, - Tensor, - Tensor, - ImplBase, MulParam>::BaseFunc; - Mul() = default; - typedef Tensor InDataTensor; - typedef Tensor OutDataTensor; - typedef Tensor OpTensor; - typedef MulParam Param_t; - typedef std::vector Input_v; - typedef std::vector Output_v; - typedef std::vector Shape_v; - - virtual SaberStatus compute_output_shape(const Input_v &input, - Output_v &output, Param_t ¶m) override { - //计算输出的shape, - Shape output_shape = (input[0]->valid_shape()); - /* code */ - return output[0]->set_shape(output_shape); - } - virtual SaberStatus init_impl(ImplEnum implenum) override { - // 不同设备均使用此init_impl, 此接口创建对应impl的实现。 - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderMul ); - return SaberSuccess; - case SABER_IMPL: - this->_impl.push_back(new SaberMul ); - return SaberSuccess; - default: - return SaberUnImplError; - } - } -private: - virtual void pick_best_static() override { - if (true) // some condition? - this->_best_impl = this->_impl[0]; - } - virtual void pick_best_specify(ImplEnum implenum) override { - this->_best_impl = this->_impl[0]; - } -}; -} // namespace saber -} // namespace anakin -``` - -### 为operator增加新的impl声明 - -涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。 -``` -#include "saber/funcs/impl/impl_macro.h" -namespace anakin{ -namespace saber{ -DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字 -} -} -``` - -### 完成新的operator特定后端实现 - -涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h``` -这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。 - -``` -// include 对应的声明 -#include "saber/funcs/impl/impl_mul.h" - -namespace anakin{ -namespace saber{ -template -class VenderMul : - public ImplBase< - Tensor, - Tensor, - Tensor, - MulParam > > -{ -public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTensor_in::Dtype InDataType; - typedef typename DataTensor_out::Dtype OutDataType; - typedef typename OpTensor::Dtype OpDataType; - VenderMul(){} - ~VenderMul() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - MulParam& param, Context& ctx) { - this->_ctx = ctx; - create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - MulParam& param, Context& ctx) { - // set内部参数 - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - MulParam& param) { - // dispatch kernel. - } - -private: -}; -} -} -``` -```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。 -### 添加framework的特化 - -涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。 -这里简单介绍下如果添加或修改framework内的operator - -``` -#include "framework/core/base.h" -#include "framework/core/data_types.h" -#include "framework/core/operator/operator.h" -#include "utils/logger/logger.h" -#include "saber/funcs/mul.h" // 需要包对应的saber头文件 -namespace anakin { -namespace ops { -template -class MulHelper; - -template -class Mul : public Operator { -public: - Mul() {} - /// forward impl - virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; - } - friend class MulHelper; -}; -template -class MulHelper : public OperatorHelper { -public: - MulHelper() = default; - ~MulHelper(); - Status InitParam() override; - - Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; - -public: - saber::MulParam> _param_mul; - saber::Mul _funcs_mul; -}; -} -} /* namespace anakin */ -``` -对应的```.cpp```文件如下: -``` -#include "framework/operators/mul.h" - -namespace anakin { -namespace ops { - -#ifdef USE_CUDA -template<> -void Mul::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_mul; - impl->_funcs_mul(ins, outs, param, ctx); -} -#endif - -template -Status MulHelper::InitParam() { - auto alpha = GET_PARAMETER(float, alpha); - MulParam> param_mul(alpha); - _param_mul = param_mul; - return Status::OK(); -} - -template -Status MulHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - - SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx)); - return Status::OK(); -} - -template -Status MulHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul)); - return Status::OK(); -} - -#ifdef USE_CUDA -template class MulHelper; -#endif -#ifdef USE_ARM_PLACE -template class MulHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32); -#endif -//! register op -ANAKIN_REGISTER_OP(Mul) -.Doc("Mul operator") -#ifdef USE_CUDA -.__alias__("mul") -#endif -#ifdef USE_ARM_PLACE -.__alias__("mul") -#endif -.num_in(1) -.num_out(1) -.Args("alpha", " alpha of Mul "); //注册 - -} /* namespace ops */ - -} /* namespace anakin */ -``` - -## 实现单元测试 -涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp``` -在对应的test下需要添加新的单元测试 - -``` -TEST(TestSaberFuncNV, test_depthwise_conv) { - - // init tensors and some param. - - // start Reshape & doInfer - Context ctx1(0, 1, 1); - - // create param - MulParam > param(alpha); - - std::vector*> input; - std::vector*> output; - - // create saber op - Mul mul; - - // compute output shape - mul.compute_output_shape(input, output, param); - - // re_alloc output tensors memory based on output shape - output[0]->re_alloc(output[0]->shape()); - - // init saber op(calling init and create) - mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); - - // call operator() - mul(input, output, param, ctx1); - - // cuda specified, record events - cudaStream_t cuda_stream = ctx1.get_compute_stream(); - output[0]->record_event(cuda_stream); - output_dev.sync(); - - // param changed - param.alpha = 2.0; - // auto calling saber op(create and dispatch) - mul(input, output, param, ctx1); - - cudaDeviceSynchronize(); - CUDA_CHECK(cudaPeekAtLastError()); -} - -int main(int argc, const char** argv){ - anakin::saber::Env::env_init(); - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - -``` -## 调试及注意事项 - -一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现, diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md deleted file mode 100644 index a1f75f5e95cfb90f26d3782ba30a6d1887a70424..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md +++ /dev/null @@ -1,459 +0,0 @@ -# 如何支持一个新的设备 - -## 概览 - -添加一个新的设备需要以下3个步骤: - -* [在`CMakeList`中添加设备的支持](#0001) -* [在`saber`中添加设备的实现](#0002) -* [在`framework`中添加设备的具体化或实例化](#0003) - -假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。 - -## 在`CMakeList`中添加设备的支持 ## - -* 修改根目录`CMakeList.txt` -```cmake -#select the plantform to build -anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO) -anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO) -anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO) -anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." YES) -``` - -* 修改`saber/CMakeList.txt` - -根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。 -```cmake -if(USE_TNEW_PLACE) - anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) - anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) -endif() -``` - -* 修改`test/CMakeList.txt` - -新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。 -```cmake -if(USE_TNEW_PLACE) - anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC) -endif() -``` - -* 修改`cmake/anakin_config.h.in` -```c++ -// plantform to use -#cmakedefine USE_GPU_PLACE - -#cmakedefine USE_X86_PLACE - -#cmakedefine USE_ARM_PLACE - -#cmakedefine USE_TNEW_PLACE -``` - -* 其他依赖和编译选项 -修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake` - - -## 在`saber`中添加设备的实现 ## -`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。 - -### 在`saber/saber_types.h`中添加设备 - -```c++ -enum TargetTypeEnum { - eINVALID = -1, - eNV = 1, - eAMD = 2, - eARM = 3, - eX86 = 4, - eNVHX86 = 5, - eTNEW = 6 -}; - -typedef TargetType NV; -typedef TargetType ARM; -typedef TargetType AMD; -typedef TargetType X86; -typedef TargetType TNEW; - -``` - -### 在`saber/core`中添加设备的实现 - -1. 在`target_traits.h`中添加新设备 - -* 增加设备类型 -```c++ -struct __cuda_device{}; -struct __arm_device{}; -struct __amd_device{}; -struct __x86_device{}; -struct __tnew_device{}; -``` - -* `TargetTypeTraits`模板具体化 -```c++ -template <> -struct TargetTypeTraits { - typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择 - typedef __tnew_device target_type; -}; -``` - -2. 在`data_traits.h`中特化`DataTrait`模板类 - -如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下: -```c++ -#ifdef USE_OPENCL -struct ClMem{ - ClMem(){ - dmem = nullptr; - offset = 0; - } - - ClMem(cl_mem* mem_in, int offset_in = 0) { - dmem = mem_in; - offset = offset_in; - } - - ClMem(ClMem& right) { - dmem = right.dmem; - offset = right.offset; - } - - ClMem& operator=(ClMem& right) { - this->dmem = right.dmem; - this->offset = right.offset; - return *this; - } - - ClMem& operator+(int offset_in) { - this->offset += offset_in; - return *this; - } - - int offset{0}; - cl_mem* dmem; -}; - -template <> -struct DataTrait { - typedef ClMem Dtype; - typedef float dtype; -}; - -template <> -struct DataTrait { - typedef ClMem Dtype; - typedef double dtype; -}; - -template <> -struct DataTrait { - typedef ClMem Dtype; - typedef char dtype; -}; -#endif //use_opencl -``` - -3. 在`target_wrapper.h`中特化`TargetWrapper`模板类 - -特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下: -```c++ -template <> -struct TargetWrapper { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target - - typedef xxx_event event_t; //根据设备实现xxx_event - typedef xxx_stream stream_t; //根据设备实现xxx_stream - - static void get_device_count(int& count); - - static void set_device(int id); - - //We should add strategy to avoid malloc directly - static void mem_alloc(void** ptr, size_t n); - - static void mem_free(void* ptr); - - static void mem_set(void* ptr, int value, size_t n); - - static void create_event(event_t& event, bool flag = false); - - static void create_stream(stream_t& stream); - - static void create_stream_with_flag(stream_t& stream, unsigned int flag); - - static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); - - static void destroy_stream(stream_t& stream); - - static void destroy_event(event_t& event); - - static void record_event(event_t& event, stream_t stream); - - static void query_event(event_t& event); - - static void sync_event(event_t& event); - - static void sync_stream(event_t& event, stream_t& stream); - - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoD); - - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoD); - - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __HtoD); - - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __HtoD); - - static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, __DtoH); - - static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ - size_t count, stream_t& stream, __DtoH); - - static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count); - - static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ - int src_dev, size_t count, stream_t& stream); - - static int get_device_id(); -}; - -``` - -4. 在`impl/`目录下添加设备目录和实现 - -在`saber/core/impl`目录下添加设备目录`tnew`。 -* 实现`TargetWrapper`结构体中各函数的定义。 -如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。 - -```c++ -typedef TargetWrapper TNEW_API; -void TNEW_API::get_device_count(int &count) { - // add implementation -} - -void TNEW_API::set_device(int id){ - // add implementation -} - -void TNEW_API::mem_alloc(void** ptr, size_t n){ - // add implementation -} - -void TNEW_API::mem_free(void* ptr){ - if(ptr != nullptr){ - // add implementation - } -} -... - -``` - -* 特化实现`device.h`中的`Device` - -```c++ -template <> -void Device::create_stream() { - // add implementation -} - -template <> -void Device::get_info() { - - // add implementation -} - -``` - -### 在`saber/funcs`中实现设备相关的op - -参考[如何增加新的Operator](addCustomOp.md) - - -## 在`framework`中添加设备的具体化或实例化 ## - -### `framework/core` - -* `net.cpp`中添加实例化 - -```c++ -#ifdef USE_TNEW_PLACE -template class Net; -template class Net; -#endif -``` - -* `operator_func.cpp`中添加实例化 - -```c++ -#ifdef USE_TNEW_PLACE -template class OperatorFunc; -#endif -``` - -* `worker.cpp`中添加实例化 - -```c++ -#ifdef USE_TNEW_PLACE -template class Worker; -template class Worker; -#endif -``` - -* `operator_attr.cpp`中添加实例化 - -```c++ -template -OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); -template -OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); -template -OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); -``` - -* `parameter.h`中添加设备的实现 - -```c++ -#ifdef USE_TNEW_PLACE -template -class PBlock { -public: - typedef Tensor4d::type> type; - - PBlock() { - _inner_tensor = std::make_shared(); - } - ... -} -#endif //TNEW -``` - -* `type_traits_extend.h`中添加设备的实现 - -```c++ -template<> -struct target_host { - typedef saber::X86 type; //根据TNEW选择正确的host type -}; -``` - -### `framework/graph` - -* `graph.cpp`中添加实例化 - -```c++ - #ifdef USE_TNEW_PLACE - template class Graph; - template class Graph; - template class Graph; - #endif -``` - -### `framework/model_parser` - -* `parser.cpp`中添加实例化 - -```c++ - #ifdef USE_TNEW_PLACE - template - Status load(graph::Graph* graph, - const char* model_path); - template - Status load(graph::Graph* graph, - const char* model_path); - template - Status load(graph::Graph* graph, - const char* model_path); - - template - Status save(graph::Graph* graph, - std::string& model_path); - template - Status save(graph::Graph* graph, - std::string& model_path); - template - Status save(graph::Graph* graph, - std::string& model_path); - - template - Status load(graph::Graph* graph, - std::string& model_path); - template - Status load(graph::Graph* graph, - std::string& model_path); - template - Status load(graph::Graph* graph, - std::string& model_path); - - template - Status save(graph::Graph* graph, - const char* model_path); - template - Status save(graph::Graph* graph, - const char* model_path); - template - Status save(graph::Graph* graph, - const char* model_path); - #endif -``` - -* `model_io.cpp`中添加实例化 - -```c++ -#ifdef USE_TNEW_PLACE -template class NodeIO; -template class NodeIO; -template class NodeIO; -#endif -``` - -### `framework/operators` - -为`framework/operators`目录下所有op添加实例化或具体化 -以`activation.cpp`为例,实例化如下: - -```c++ -#ifdef USE_TNEW_PLACE -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); -template class ActivationHelper; -ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); -#endif -``` - -如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例): -```c++ -#ifdef USE_TNEW_PLACE -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); -INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); -template <> -Status ActivationHelper::Init(OpContext &ctx,\ - const std::vector >& ins, \ - std::vector >& outs) { - SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式 - return Status::OK(); -} -ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); -#endif -``` - -在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册 - -```c++ -#ifdef USE_TNEW_PLACE -.__alias__("activation") -#endif -``` - -## 注意事项 -不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst deleted file mode 100644 index e4682ccb94e6fc60e184632dff9ee16a6bf16ec0..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst +++ /dev/null @@ -1,26 +0,0 @@ -Anakin - 服务器端加速引擎 -####################### - - -使用文档 -~~~~~~~ - -.. toctree:: - :maxdepth: 1 - - install_anakin.md - convert_paddle_to_anakin.md - run_anakin_on_arm.md - anakin_tutorial.md - anakin_example.md - anakin_gpu_benchmark.md - anakin_arm_benchmark.md - -开发文档 -~~~~~~~ - -.. toctree:: - :maxdepth: 1 - - how_to_add_anakin_op.md - how_to_support_new_device_in_anakin.md diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst deleted file mode 100644 index 47df6392c123d520c701089db6ee1ae72e4f8ea5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/index_mobile.rst +++ /dev/null @@ -1,9 +0,0 @@ -移动端部署 -########## - -.. toctree:: - :maxdepth: 2 - - mobile_build.md - mobile_dev.md - diff --git a/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md deleted file mode 100644 index bb7c1950308622e3de292268a718e6ec688e6ae6..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/install_anakin.md +++ /dev/null @@ -1,69 +0,0 @@ -## 从源码编译安装Anakin ## - -我们已经在CentOS 7.3上成功的安装和测试了Anakin,对于其他操作系统,我们将很快支持。 - -### 安装概览 ### - -* [在CentOS上安装 Anakin]() -* [在Ubuntu上安装 Anakin]() -* [在ARM上安装 Anakin](run_on_arm_ch.md) -* [验证安装]() - - -### 在CentOS上安装 Anakin ### -#### 1. 系统要求 #### - -* make 3.82+ -* cmake 2.8.12+ -* gcc 4.8.2+ -* g++ 4.8.2+ -* 其他需要补充的。。。 - -#### 2. 编译CPU版Anakin #### - -暂时不支持 - -#### 3. 编译支持NVIDIA GPU的Anakin #### - -- 3.1. 安装依赖 - - 3.1.1 protobuf - >$ git clone https://github.com/google/protobuf - >$ cd protobuf - >$ git submodule update --init --recursive - >$ ./autogen.sh - >$ ./configure --prefix=/path/to/your/insall_dir - >$ make - >$ make check - >$ make install - >$ sudo ldconfig - - - 如安装protobuf遇到任何问题,请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md) - -- 3.2 CUDA Toolkit - - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). - - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). -- 3.3 编译Anakin - >$ git clone https:/xxxxx - >$ cd anakin - >$ mkdir build - >$ camke .. - >$ make - - -#### 4. 编译支持AMD GPU的Anakin #### - -暂时还不支持 - - -### 在Ubuntu上安装 Anakin ### - -暂时还不支持 - - -### 在ARM上安装 Anakin ### - -暂时还不支持 - -### 验证安装 ### -we are coming soon... diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md deleted file mode 100644 index e51593164987d548e256ddebbc5fa8d960fb5255..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/mobile_build.md +++ /dev/null @@ -1,59 +0,0 @@ -# 环境搭建 -## 使用 docker -### 1. 安装 docker -安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) -### 2. 使用 docker 搭建构建环境 -首先进入 paddle-mobile 的目录下,执行 `docker build` -以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) -``` -$ docker build -t paddle-mobile:dev - < Dockerfile -``` -使用 `docker images` 可以看到我们新建的 image -``` -$ docker images -REPOSITORY TAG IMAGE ID CREATED SIZE -paddle-mobile dev 33b146787711 45 hours ago 372MB -``` -### 3. 使用 docker 构建 -进入 paddle-mobile 目录,执行 docker run -``` -$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev -root@5affd29d4fc5:/ # cd /paddle-mobile -# 生成构建 android 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake -# 生成构建 linux 产出的 Makefile -root@5affd29d4fc5:/ # rm CMakeCache.txt -root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake -``` -### 4. 设置编译选项 -可以通过 ccmake 设置编译选项 -``` -root@5affd29d4fc5:/ # ccmake . - Page 1 of 1 - CMAKE_ASM_FLAGS - CMAKE_ASM_FLAGS_DEBUG - CMAKE_ASM_FLAGS_RELEASE - CMAKE_BUILD_TYPE - CMAKE_INSTALL_PREFIX /usr/local - CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake - CPU ON - DEBUGING ON - FPGA OFF - LOG_PROFILE ON - MALI_GPU OFF - NET googlenet - USE_EXCEPTION ON - USE_OPENMP OFF -``` -修改选项后,按 `c`, `g` 更新 Makefile -### 5. 构建 -使用 make 命令进行构建 -``` -root@5affd29d4fc5:/ # make -``` -### 6. 查看构建产出 -构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行 - -## 不使用 docker -不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。 diff --git a/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md b/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md deleted file mode 100644 index 474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/mobile_dev.md +++ /dev/null @@ -1,72 +0,0 @@ -# iOS开发文档 - -## 编译 - -### 一. 使用 build.sh 编译 - -```sh -sh build.sh ios - -# 如果只想编译某个特定模型的 op, 则需执行以下命令 -sh build.sh ios googlenet - -# 在这个文件夹下, 你可以拿到生成的 .a 库 -cd ../build/release/ios/build - -``` - -### 二. 使用 xcode 编译 - -我们提供了 ios 开发更为熟悉的 xcode 编译环境: -在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo - -### 三. 集成 - -#### 如使用 c++ 接口 -将 - -``` -libpaddle-mobile.a -io.h -program.h -types.h -lod_tensor.h -tensor.h -``` -拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释 - -#### 如使用 oc 接口 -将在xcode 编译生成的 -``` -libPaddleMobile.a -PaddleMobile.h -``` -拖入工程, 接口如下: - -``` -/* - 创建单例对象 -*/ -+ (instancetype)sharedInstance; - -/* - load 模型, 开辟内存 -*/ -- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; - -/* - 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict -*/ -- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale; - -/* - 进行预测 -*/ -- (NSArray *)predict:(CGImageRef)image; - -/* - 清理内存 -*/ -- (void)clear; - -``` diff --git a/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md deleted file mode 100644 index ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/deploy/run_anakin_on_arm.md +++ /dev/null @@ -1,151 +0,0 @@ -## 源码编译 Anakin ## - -目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。 - -### 安装概览 ### - -* [系统需求](#0001) -* [安装第三方依赖](#0002) -* [Anakin源码编译](#0003) -* [验证安装](#0004) - - -### 1. 系统需求 ### - -* 宿主机: linux, mac -* cmake 3.8.2+ -* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) - -### 2. 安装第三方依赖 ### - -- 2.1 protobuf3.4.0 - 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) - - 2.1.1 为宿主机编译protobuf - ```bash - $ tar -xzf protobuf-3.4.0.tar.gz - $ cd protobuf-3.4.0 - $ ./autogen.sh - $ ./configure - $ make - $ make check - $ make install - ``` - 上述 $make install 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下, - 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。 - 然后将已经生成文件清除。 - ```bash - $ make distclean - ``` - - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值, - ```bash - - $ export ANDROID_NDK=your_ndk_path - $ ARCH_ABI="arm-linux-androideabi-4.9" - $ HOSTOSN="darwin-x86_64" - $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm - $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI - $ export LDFLAGS="--sysroot=$SYSROOT" - $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS" - $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a" - $ export CPPFLAGS="" - $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/" - $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT" - $ export CCFLAGS="$CXXFLAGS" - $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS" - $ export CC="$CXX" - $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib" - $ ./autogen.sh - $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD" - $ make - ``` - - 编译生成 *.a 静态库,若希望编译*.so 动态链接库 ,请在./configure参数中改--disable-shared为--disable-static --enable-shared。 - 生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。 - 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。 - ```cmake - set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") - ``` - -- 2.2 opencv 2.4.3+(optional) - Anakin只在examples示例中使用opencv - Android系统的opencv从[这里下载](https://opencv.org/releases.html) - 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a` - 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, - 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。 - ```cmake - include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) - LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) - ``` -### 3. Anakin源码编译 ### - -#### 编译Android版本 - - 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm) -```bash - cd your_dir - git clone https://github.com/PaddlePaddle/Anakin.git - cd Anakin - git fetch origin arm - git checkout arm - ``` - 修改`android_build.sh` -- 修改NDK路径 - ```bash - #modify "your_ndk_path" to your NDK path - export ANDROID_NDK=your_ndk_path - ``` -- 修改ARM 处理器架构 - 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`, - 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。 - 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中。 - ```bash - -DANDROID_ABI="armeabi-v7a with NEON" - ``` -- 设置Android API - 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1 - ```bash - -DANDROID_NATIVE_API_LEVEL=21 - ``` - -- 选择编译静态库或动态库 - 设置`BUILD_SHARED=NO`编译静态库 - 设置`BUILD_SHARED=YES`编译动态库 - ```bash - -DBUILD_SHARED=NO - ``` -- OpenMP多线程支持 - 设置`USE_OPENMP=YES`开启OpenMP多线程 - ```bash - -DUSE_OPENMP=YES - ``` - -- 编译单测文件 - 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件 - ```bash - -DBUILD_WITH_UNIT_TEST=YES - ``` - -- 编译示例文件 - 设置`BUILD_EXAMPLES=YES`将会编译示例文件 - ```bash - -DBUILD_EXAMPLES=YES - ``` - -- 开启opencv - 如果使用opencv,设置`USE_OPENCV=YES` - ```bash - -DUSE_OPENCV=YES - ``` - -- 开始编译 - 运行脚本 `android_build.sh` 将自动编译Anakin - ```bash - ./android_build.sh - ``` - -### 4. 验证安装 ### - 编译好的库会放在目录`${Anakin_root}/output`下; - 编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下; - 编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。 - - 对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录, 运行测试文件。 diff --git a/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md deleted file mode 120000 index 1126df7a829ab6d98e58a44e8f9c6459feae9a8b..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md +++ /dev/null @@ -1 +0,0 @@ -../../../dev/contribute_to_paddle_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md deleted file mode 120000 index 1381a3b05f6761c60742eb9365708d94ad8a2642..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md +++ /dev/null @@ -1 +0,0 @@ -../../../howto/optimization/cpu_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst b/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst deleted file mode 100644 index f2396716bddd4810fa77c738d41f5482aa6d6055..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/gpu_profiling_cn.rst +++ /dev/null @@ -1,242 +0,0 @@ -============ -GPU性能调优 -============ - -.. contents:: - -此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 - -- 什么是性能分析? -- 为什么需要性能分析? -- 如何进行性能分析? -- 性能分析工具介绍 -- 详细教程 -- 性能分析小技巧 - -什么是性能分析? -================ -在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, -也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 - -简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? - -为什么需要性能分析? -============================ -训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。 -而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。 -如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦! - -如何进行性能分析? -======================== -为了达到性能最优,您可以采用下面五个步骤: - -- 对代码进行性能分析 -- 找到运行慢的部分 -- 找到运行慢的原因 -- 修改成更快的版本 -- 再次对代码进行性能分析 - -Usually, processor has two key performance limits include float point throughput and -memory throughput. For GPU, it also need more parallelism to fulfill its potential. -This is why they can be so fast. - -通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。 -GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。 - -性能分析工具介绍 -====================== -就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。 - -**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 -在这个教程中,我们主要会介绍nvprof和nvvp。 - -:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate -above profilers. - -:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 - -.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :linenos: - -上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。 - -1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。 - -2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid -program crashes when CPU version of PaddlePaddle invokes them. - -3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。 - -您会在接下来的部分中获得更多的细节介绍。 - -详细教程 -============ - -内置定时器 ------------- - -如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。 -接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。 -下面举个简单的例子: - -1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 - - .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :emphasize-lines: 8-12,14 - :linenos: - -2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。 - - .. code-block:: bash - - cmake .. -DWITH_TIMER=ON - make - -3. 执行您的代码,并观察结果(如高亮部分)。 - - .. code-block:: bash - :emphasize-lines: 1,12-15 - - > ./paddle/legacy/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler - I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions - I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. - [==========] Running 1 test from 1 test case. - [----------] Global test environment set-up. - [----------] 1 test from Profiler - [ RUN ] Profiler.BilinearFwdBwd - I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im - gSizeX = 64, imgSizeY = 64" - I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 - I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== - I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 - I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== - I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- - [ OK ] Profiler.BilinearFwdBwd (136 ms) - [----------] 1 test from Profiler (136 ms total) - - [----------] Global test environment tear-down - [==========] 1 test from 1 test case ran. (136 ms total) - [ PASSED ] 1 test. - -nvprof 工具 ----------------- - -要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: - -1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 - - .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :emphasize-lines: 6-7 - :linenos: - -2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 - - .. code-block:: bash - - cmake .. -DWITH_PROFILER=ON - make - -3. 使用 **nvprof** 来分析执行文件。 - - .. code-block:: bash - - nvprof ./paddle/legacy/math/tests/test_GpuProfiler - -然后,您就能获得如下的分析结果: - -.. code-block:: bash - - ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler - ==78544== Profiling result: - Time(%) Time Calls Avg Min Max Name - 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] - 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw - 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw - 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] - - ==78544== API calls: - Time(%) Time Calls Avg Min Max Name - 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags - 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree - 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate - 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy - 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize - 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc - 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc - 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice - 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags - 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute - 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount - 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties - 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch - 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName - 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem - 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice - 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate - 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute - 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart - 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall - 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError - 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument - 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet - 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount - 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion - 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit - 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion - - -nvvp 工具 --------------- - -如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 - -**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) - -.. image:: nvvp1.png - :align: center - :scale: 33% - -从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 -同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 - - -.. image:: nvvp2.png - :align: center - :scale: 33% - -而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 -例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 - -.. image:: nvvp3.png - :align: center - :scale: 33% - -.. image:: nvvp4.png - :align: center - :scale: 33% - -性能分析小技巧 -================== - -- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 -- 接下来可以考虑下时间线的分析。 -- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 -- 可能的情况下,试着让输出的分析数据和理论值对应。 - - 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 - 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 -- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… - -性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! -当然,具体情况因人而异。 - -参考资料 -=========== -Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md deleted file mode 120000 index 904968ba4a8d6cc6489c91a0a751e0a33dcc873c..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md +++ /dev/null @@ -1 +0,0 @@ -../../../howto/optimization/host_memory_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/new_op.md b/doc/fluid/new_docs/advanced_usage/development/new_op.md deleted file mode 120000 index dce0348585b8c484c1418a03a5fde5d78b0afcc9..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/new_op.md +++ /dev/null @@ -1 +0,0 @@ -../../../dev/new_op_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp1.png b/doc/fluid/new_docs/advanced_usage/development/nvvp1.png deleted file mode 100644 index 1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/nvvp1.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp2.png b/doc/fluid/new_docs/advanced_usage/development/nvvp2.png deleted file mode 100644 index 177c9db708da6863d1075f3e615f5962dbe18b29..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/nvvp2.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp3.png b/doc/fluid/new_docs/advanced_usage/development/nvvp3.png deleted file mode 100644 index d8f393667d6569b6f1e61ffccac43fae5888b6db..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/nvvp3.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/nvvp4.png b/doc/fluid/new_docs/advanced_usage/development/nvvp4.png deleted file mode 100644 index 51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/nvvp4.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_1.png b/doc/fluid/new_docs/advanced_usage/development/pprof_1.png deleted file mode 100644 index 8e9edbf377672d0ef40f2fc7bd39e746923550cb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/pprof_1.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/pprof_2.png b/doc/fluid/new_docs/advanced_usage/development/pprof_2.png deleted file mode 100644 index 172ba20399ba974d27f4c072425277b69b02520b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/pprof_2.png and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg b/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg deleted file mode 100644 index 38ec3f80c982857531f30a8bb0fa26ea5bf05385..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/timeline.jpeg and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md deleted file mode 120000 index a05540e82a7fa795dcd8e7306261ef9bef57426f..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md +++ /dev/null @@ -1 +0,0 @@ -../../../howto/optimization/timeline_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg b/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg deleted file mode 100644 index 3a49fc4f8a401a9463b0157e2f38c164ca02dcc5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/development/tracing.jpeg and /dev/null differ diff --git a/doc/fluid/new_docs/advanced_usage/development/write_docs.rst b/doc/fluid/new_docs/advanced_usage/development/write_docs.rst deleted file mode 120000 index dc536c8bdd4924758d4418bac8e4181ffbb1f780..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/development/write_docs.rst +++ /dev/null @@ -1 +0,0 @@ -../../../dev/write_docs_cn.rst \ No newline at end of file diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst deleted file mode 100644 index 89166573eebca045e948046c69f3b7a3e0031d58..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/advanced_usage/index.rst +++ /dev/null @@ -1,22 +0,0 @@ -######## -进阶使用 -######## - - -.. todo:: - - Complete this guide - -.. toctree:: - :maxdepth: 2 - - deploy/index_anakin.rst - deploy/index_mobile.rst - development/contribute_to_paddle.md - development/write_docs.rst - development/new_op.md - development/cpu_profiling_cn.md - development/gpu_profiling_cn.rst - development/host_memory_profiling_cn.md - development/timeline_cn.md - benchmark.rst diff --git a/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png deleted file mode 100644 index 52d4992a22397119af949aa7c11a9ea6365c167c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/advanced_usage/pics/anakin_fm_ch.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore deleted file mode 100644 index dc7c62b06287ad333dd41082e566b0553d3a5341..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -*.pyc -train.log -output -data/cifar-10-batches-py/ -data/cifar-10-python.tar.gz -data/*.txt -data/*.list -data/mean.meta diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md deleted file mode 100644 index 4f20843596aa676962a36241f59560ec2a41257b..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md +++ /dev/null @@ -1,576 +0,0 @@ - -# 图像分类 - -本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。 - -## 背景介绍 - -图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 - -图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 - - -一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。 - -而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。 - -本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。 - -## 效果展示 - -图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 - -

-
-图1. 通用图像分类展示 -

- - -图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。 - - -

-
-图2. 细粒度图像分类展示 -

- - -一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 - -

-
-图3. 扰动图片展示[22] -

- -## 模型概览 - -图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。 - -在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。 - - 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。 - - 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 - - 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 - - 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - -这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 - -Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 - -

-
-图4. ILSVRC图像分类Top-5错误率 -

- -### CNN - -传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 - -

-
-图5. CNN网络示例[20] -

- -- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 -- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 -- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。 -- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。 -- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。 - -另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。 - -接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。 - -### VGG - -牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 - -

-
-图6. 基于ImageNet的VGG16模型 -

- -### GoogleNet - -GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。 - -NIN模型主要有两个特点: - -1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。 - -2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。 - -Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。 - -

-
-图7. Inception模块 -

- -GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。 - -GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。 - -

-
-图8. GoogleNet[12] -

- - -上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层;GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,介于篇幅,这里不再详细介绍v2到v4的结构。 - - -### ResNet - -ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。 - -残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。 - -

-
-图9. 残差模块 -

- -图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。 - -

-
-图10. 基于ImageNet的ResNet模型 -

- - -## 数据准备 - -通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。 - -由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 - -

-
-图11. CIFAR10数据集[21] -

- -Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。 - -通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。 - -### 模型结构 - -#### Paddle 初始化 - -让我们从导入 Paddle Fluid API 和辅助模块开始。 - -```python -import paddle -import paddle.fluid as fluid -import numpy -import sys -from __future__ import print_function -``` - -本教程中我们提供了VGG和ResNet两个模型的配置。 - -#### VGG - -首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。 -VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - -```python -def vgg_bn_drop(input): - def conv_block(ipt, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=ipt, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu') - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - predict = fluid.layers.fc(input=fc2, size=10, act='softmax') - return predict -``` - - -1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。 - -2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - -3. 最后接两层512维的全连接。 - -4. 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 - -### ResNet - -ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 - -先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 - - - `conv_bn_layer` : 带BN的卷积层。 - - `shortcut` : 残差模块的"直连"路径,"直连"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。 - - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条"直连"路径组成。 - - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。 - - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。 - -```python -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu', - bias_attr=False): - tmp = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=bias_attr) - return fluid.layers.batch_norm(input=tmp, act=act) - - -def shortcut(input, ch_in, ch_out, stride): - if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None) - else: - return input - - -def basicblock(input, ch_in, ch_out, stride): - tmp = conv_bn_layer(input, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) - short = shortcut(input, ch_in, ch_out, stride) - return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') - - -def layer_warp(block_func, input, ch_in, ch_out, count, stride): - tmp = block_func(input, ch_in, ch_out, stride) - for i in range(1, count): - tmp = block_func(tmp, ch_out, ch_out, 1) - return tmp -``` - -`resnet_cifar10` 的连接结构主要有以下几个过程。 - -1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 - -2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 - -3. 最后对网络做均值池化并返回该层。 - -注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 - -```python -def resnet_cifar10(ipt, depth=32): - # depth should be one of 20, 32, 44, 56, 110, 1202 - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - nStages = {16, 64, 128} - conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) - res2 = layer_warp(basicblock, res1, 16, 32, n, 2) - res3 = layer_warp(basicblock, res2, 32, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - predict = fluid.layers.fc(input=pool, size=10, act='softmax') - return predict -``` - -## Infererence Program 配置 - -网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32)。 - -```python -def inference_program(): - # The image is 32 * 32 with RGB representation. - data_shape = [3, 32, 32] - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - - predict = resnet_cifar10(images, 32) - # predict = vgg_bn_drop(images) # un-comment to use vgg net - return predict -``` - -## Train Program 配置 - -然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。 -在训练期间,它将从预测中计算 `avg_cost`。 -在有监督训练中需要输入图像对应的类别信息,同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - -**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 - -```python -def train_program(): - predict = inference_program() - - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - accuracy = fluid.layers.accuracy(input=predict, label=label) - return [avg_cost, accuracy] -``` - -## Optimizer Function 配置 - -在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 - -```python -def optimizer_program(): - return fluid.optimizer.Adam(learning_rate=0.001) -``` - -## 训练模型 - -### Trainer 配置 - -现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer_func`。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -trainer = fluid.Trainer( - train_func=train_program, - optimizer_func=optimizer_program, - place=place) -``` - -### Data Feeders 配置 - -`cifar.train10()` 每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 - -```python -# Each batch will yield 128 images -BATCH_SIZE = 128 - -# Reader for training -train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000), - batch_size=BATCH_SIZE) - -# Reader for testing. A separated data set for testing. -test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) -``` - -### Event Handler - -可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 - -`event_handler_plot`可以用来利用回调数据来打点画图: - -

-
-图12. 训练结果 -

- - -```python -params_dirname = "image_classification_resnet.inference.model" - -from paddle.v2.plot import Ploter - -train_title = "Train cost" -test_title = "Test cost" -cost_ploter = Ploter(train_title, test_title) - -step = 0 -def event_handler_plot(event): - global step - if isinstance(event, fluid.EndStepEvent): - if step % 1 == 0: - cost_ploter.append(train_title, step, event.metrics[0]) - cost_ploter.plot() - step += 1 - if isinstance(event, fluid.EndEpochEvent): - avg_cost, accuracy = trainer.test( - reader=test_reader, - feed_order=['pixel', 'label']) - cost_ploter.append(test_title, step, avg_cost) - - # save parameters - if params_dirname is not None: - trainer.save_params(params_dirname) -``` - -`event_handler` 用来在训练过程中输出文本日志 - -```python -params_dirname = "image_classification_resnet.inference.model" - -# event handler to track training and testing process -def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - if event.step % 100 == 0: - print("\nPass %d, Batch %d, Cost %f, Acc %f" % - (event.step, event.epoch, event.metrics[0], - event.metrics[1])) - else: - sys.stdout.write('.') - sys.stdout.flush() - - if isinstance(event, fluid.EndEpochEvent): - # Test against with the test dataset to get accuracy. - avg_cost, accuracy = trainer.test( - reader=test_reader, feed_order=['pixel', 'label']) - - print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy)) - - # save parameters - if params_dirname is not None: - trainer.save_params(params_dirname) -``` - -### 训练 - -通过`trainer.train`函数训练: - -**注意:** CPU,每个 Epoch 将花费大约15~20分钟。这部分可能需要一段时间。请随意修改代码,在GPU上运行测试,以提高训练速度。 - -```python -trainer.train( - reader=train_reader, - num_epochs=2, - event_handler=event_handler, - feed_order=['pixel', 'label']) -``` - -一轮训练log示例如下所示,经过1个pass, 训练集上平均 Accuracy 为0.59 ,测试集上平均 Accuracy 为0.6 。 - -```text -Pass 0, Batch 0, Cost 3.869598, Acc 0.164062 -................................................................................................... -Pass 100, Batch 0, Cost 1.481038, Acc 0.460938 -................................................................................................... -Pass 200, Batch 0, Cost 1.340323, Acc 0.523438 -................................................................................................... -Pass 300, Batch 0, Cost 1.223424, Acc 0.593750 -.......................................................................................... -Test with Pass 0, Loss 1.1, Acc 0.6 -``` - -图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 - -

-
-图13. CIFAR10数据集上VGG模型的分类错误率 -

- -## 应用模型 - -可以使用训练好的模型对图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断,可以打开注释,更改加载的模型。 - -### 生成预测输入数据 - -`dog.png` is an example image of a dog. Turn it into an numpy array to match the data feeder format. - -```python -# Prepare testing data. -from PIL import Image -import numpy as np -import os - -def load_image(file): - im = Image.open(file) - im = im.resize((32, 32), Image.ANTIALIAS) - - im = np.array(im).astype(np.float32) - # The storage order of the loaded image is W(width), - # H(height), C(channel). PaddlePaddle requires - # the CHW order, so transpose them. - im = im.transpose((2, 0, 1)) # CHW - im = im / 255.0 - - # Add one dimension to mimic the list format. - im = numpy.expand_dims(im, axis=0) - return im - -cur_dir = os.getcwd() -img = load_image(cur_dir + '/image/dog.png') -``` - -### Inferencer 配置和预测 - -`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 -我们可以简单地插入前面定义的推理程序。 -现在我们准备做预测。 - -```python -inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) -label_list = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"] -# inference -results = inferencer.infer({'pixel': img}) -print("infer results: %s" % label_list[np.argmax(results[0])]) -``` - -## 总结 - -传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 - - -## 参考文献 - -[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. - -[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. - -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. - -[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. - -[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. - -[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. - -[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). - -[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. - -[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. - -[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. - -[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。 - -[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) - -[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014. - -[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. - -[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. - -[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). - -[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). - -[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. - -[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. - -[20] http://deeplearning.net/tutorial/lenet.html - -[21] https://www.cs.toronto.edu/~kriz/cifar.html - -[22] http://cs231n.github.io/classification/ - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst deleted file mode 100644 index 0fcb008e0a7773e81e5124da09fe07366130b924..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -################ -深度学习基础知识 -################ - - -.. todo:: - - 概述 - -.. toctree:: - :maxdepth: 2 - - image_classification/README.cn.md - word2vec/README.cn.md - recommender_system/README.cn.md - understand_sentiment/README.cn.md - label_semantic_roles/README.cn.md - machine_translation/README.cn.md diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore deleted file mode 100644 index 29b5622a53a1b0847e9f53febf1cc50dcf4f044a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -data/train.list -data/test.* -data/conll05st-release.tar.gz -data/conll05st-release -data/predicate_dict -data/label_dict -data/word_dict -data/emb -data/feature -output -predict.res -train.log diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md deleted file mode 100644 index 0891f5b6b16a1b715b44db6c47ba079adfcad4c5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md +++ /dev/null @@ -1,562 +0,0 @@ -# 语义角色标注 - -本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/178.html)。 - -## 背景介绍 - -自然语言分析技术大致分为三个层面:词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中,谓词是对主语的陈述或说明,指出“做什么”、“是什么”或“怎么样,代表了一个事件的核心,跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有:施事者(Agent)、受事者(Patient)、客体(Theme)、经验者(Experiencer)、受益者(Beneficiary)、工具(Instrument)、处所(Location)、目标(Goal)和来源(Source)等。 - -请看下面的例子,“遇到” 是谓词(Predicate,通常简写为“Pred”),“小明”是施事者(Agent),“小红”是受事者(Patient),“昨天” 是事件发生的时间(Time),“公园”是事情发生的地点(Location)。 - -$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mbox{Time}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ - -语义角色标注(Semantic Role Labeling,SRL)以句子的谓词为中心,不对句子所包含的语义信息进行深入分析,只分析句子中各成分与谓词之间的关系,即句子的谓词(Predicate)- 论元(Argument)结构,并用语义角色来描述这些结构关系,是许多自然语言理解任务(如信息抽取,篇章分析,深度问答等)的一个重要中间步骤。在研究中一般都假定谓词是给定的,所要做的就是找出给定谓词的各个论元和它们的语义角色。 - -传统的SRL系统大多建立在句法分析基础之上,通常包括5个流程: - -1. 构建一棵句法分析树,例如,图1是对上面例子进行依存句法分析得到的一棵句法树。 -2. 从句法树上识别出给定谓词的候选论元。 -3. 候选论元剪除;一个句子中的候选论元可能很多,候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。 -4. 论元识别:这个过程是从上一步剪除之后的候选中判断哪些是真正的论元,通常当做一个二分类问题来解决。 -5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 - -
-
-图1. 依存句法分析句法树示例 -
- -然而,完全句法分析需要确定句子所包含的全部句法信息,并确定句子各成分之间的关系,是一个非常困难的任务,目前技术下的句法分析准确率并不高,句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度,同时获得一定的句法结构信息,“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析(partial parsing)或语块划分(chunking)。和完全句法分析得到一颗完整的句法树不同,浅层句法分析只需要识别句子中某些结构相对简单的独立成分,例如:动词短语,这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难,一些研究\[[1](#参考文献)\]也提出了基于语块(chunk)的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集,我们先来介绍这种表示方法。在BIO表示法中,B代表语块的开始,I代表语块的中间,O代表语块结束。通过B、I、O 三种标记将不同的语块赋予不同的标签,例如:对于一个由角色A拓展得到的语块组,将它所包含的第一个语块赋予标签B-A,将它所包含的其它语块赋予标签I-A,不属于任何论元的语块赋予标签O。 - -我们继续以上面的这句话为例,图1展示了BIO表示方法。 - -
-
-图2. BIO标注方法示例 -
- -从上面的例子可以看到,根据序列标注结果可以直接得到论元的语义角色标注结果,是一个相对简单的过程。这种简单性体现在:(1)依赖浅层句法分析,降低了句法分析的要求和难度;(2)没有了候选论元剪除这一步骤;(3)论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法,简化了流程,降低了错误累积的风险,往往能够取得更好的结果。 - -与基于语块的SRL方法类似,在本教程中我们也将SRL看作一个序列标注问题,不同的是,我们只依赖输入文本序列,不依赖任何额外的语法解析结果或是复杂的人造特征,利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例,实践下面的任务:给定一句话和这句话里的一个谓词,通过序列标注的方式,从句子中找到谓词对应的论元,同时标注它们的语义角色。 - -## 模型概览 - -循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 - -### 栈式循环神经网络(Stacked Recurrent Neural Network) - -深层网络有助于形成层次化特征,网络上层在下层已经学习到的初级特征基础上,形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络,但由于LSTM各个时间步参数共享,$t-1$时刻状态到$t$时刻的映射,始终只经过了一次非线性映射,也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元,令前一个LSTM$t$时刻的输出,成为下一个LSTM单元$t$时刻的输入,帮助我们构建起一个深层网络,我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力,能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。 - -然而,训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常,堆叠4层LSTM单元可以正常训练,当层数达到4~8层时,会出现性能衰减,这时必须考虑一些新的结构以保证梯度纵向顺畅传播,这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一:在记忆单元(Memory Cell)这条信息传播的路线上没有非线性映射,当梯度反向传播时既不会衰减、也不会爆炸。因此,深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。 - -一个LSTM单元完成的运算可以被分为三部分:(1)输入到隐层的映射(input-to-hidden) :每个时间步输入信息$x$会首先经过一个矩阵映射,再作为遗忘门,输入门,记忆单元,输出门的输入,注意,这一次映射没有引入非线性激活;(2)隐层到隐层的映射(hidden-to-hidden):这一步是LSTM计算的主体,包括遗忘门,输入门,记忆单元更新,输出门的计算;(3)隐层到输出的映射(hidden-to-output):通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上,加入一条新的路径:除上一层LSTM输出之外,将前层LSTM的输入到隐层的映射作为的一个新的输入,同时加入一个线性映射去学习一个新的变换。 - -图3是最终得到的栈式循环神经网络结构示意图。 - -

-
-图3. 基于LSTM的栈式循环神经网络结构示意图 -

- -### 双向循环神经网络(Bidirectional Recurrent Neural Network) - -在LSTM中,$t$时刻的隐藏层向量编码了到$t$时刻为止所有输入的信息,但$t$时刻的LSTM可以看到历史,却无法看到未来。在绝大多数自然语言处理任务中,我们几乎总是能拿到整个句子。这种情况下,如果能够像获取历史信息一样,得到未来的信息,对序列学习任务会有很大的帮助。 - -为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 - -

-
-图4. 基于LSTM的双向循环神经网络结构示意图 -

- -需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中,介绍另一种双向循环神经网络。 - -### 条件随机场 (Conditional Random Field) - -使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务。在SRL任务中,深层LSTM网络学习输入的特征表示,条件随机场(Conditional Random Filed, CRF)在特征的基础上完成序列标注,处于整个网络的末端。 - -CRF是一种概率化结构模型,可以看作是一个概率无向图模型,结点表示随机变量,边表示随机变量之间的概率依赖关系。简单来讲,CRF学习条件概率$P(X|Y)$,其中 $X = (x_1, x_2, ... , x_n)$ 是输入序列,$Y = (y_1, y_2, ... , y_n)$ 是标记序列;解码过程是给定 $X$序列求解令$P(Y|X)$最大的$Y$序列,即$Y^* = \mbox{arg max}_{Y} P(Y | X)$。 - -序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 - -

-
-图5. 序列标注任务中使用的线性链条件随机场 -

- -根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\],在给定观测序列$X$时,一个特定标记序列$Y$的概率可以定义为: - -$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ - -其中$Z(X)$是归一化因子,$t_j$ 是定义在边上的特征函数,依赖于当前和前一个位置,称为转移特征,表示对于输入序列$X$及其标注序列在 $i$及$i - 1$位置上标记的转移概率。$s_k$是定义在结点上的特征函数,称为状态特征,依赖于当前位置,表示对于观察序列$X$及其$i$位置的标记概率。$\lambda_j$ 和 $\mu_k$ 分别是转移特征函数和状态特征函数对应的权值。实际上,$t$和$s$可以用相同的数学形式表示,再对转移特征和状态特在各个位置$i$求和有:$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$,把$f$统称为特征函数,于是$P(Y|X)$可表示为: - -$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ - -$\omega$是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$ ,通过正则化的极大似然估计,求解如下优化目标: - -$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ - -这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列$X$,通过解码算法(通常有:维特比算法、Beam Search)求令出条件概率$\bar{P}(Y|X)$最大的输出序列 $\bar{Y}$。 - -### 深度双向LSTM(DB-LSTM)SRL模型 - -在SRL任务中,输入是 “谓词” 和 “一句话”,目标是从这句话中找到谓词的论元,并标注论元的语义角色。如果一个句子含有$n$个谓词,这个句子会被处理$n$次。一个最为直接的模型是下面这样: - -1. 构造输入; - - 输入1是谓词,输入2是句子 - - 将输入1扩展成和输入2一样长的序列,用one-hot方式表示; -2. one-hot方式的谓词序列和句子序列通过词表,转换为实向量表示的词向量序列; -3. 将步骤2中的2个词向量序列作为双向LSTM的输入,学习输入序列的特征表示; -4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注; - -大家可以尝试上面这种方法。这里,我们提出一些改进,引入两个简单但对提高系统性能非常有效的特征: - -- 谓词上下文:上面的方法中,只用到了谓词的词向量表达谓词相关的所有信息,这种方法始终是非常弱的,特别是如果谓词在句子中出现多次,有可能引起一定的歧义。从经验出发,谓词前后若干个词的一个小片段,能够提供更丰富的信息,帮助消解歧义。于是,我们把这样的经验也添加到模型中,为每个谓词同时抽取一个“谓词上下文” 片段,也就是从这个谓词前后各取$n$个词构成的一个窗口片段; -- 谓词上下文区域标记:为句子中的每一个词引入一个0-1二值变量,表示它们是否在“谓词上下文”片段中; - -修改后的模型如下(图6是一个深度为4的模型结构示意图): - -1. 构造输入 - - 输入1是句子序列,输入2是谓词序列,输入3是谓词上下文,从句子中抽取这个谓词前后各$n$个词,构成谓词上下文,用one-hot方式表示,输入4是谓词上下文区域标记,标记了句子中每一个词是否在谓词上下文中; - - 将输入2~3均扩展为和输入1一样长的序列; -2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列;其中输入1、3共享同一个词表,输入2和4各自独有词表; -3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; -4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; - -
-
-图6. SRL任务上的深层双向LSTM模型 -
- - -## 数据介绍 - -在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 - -原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: - -```text -conll05st-release/ -└── test.wsj - ├── props # 标注结果 - └── words # 输入文本序列 -``` - -标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 - -原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: - -1. 将文本序列和标记序列其合并到一条记录中; -2. 一个句子如果含有$n$个谓词,这个句子会被处理$n$次,变成$n$条独立的训练样本,每个样本一个不同的谓词; -3. 抽取谓词上下文和构造谓词上下文区域标记; -4. 构造以BIO法表示的标记; -5. 依据词典获取词对应的整数索引。 - -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 - -| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | -|---|---|---|---|---| -| A | set | n't been set . × | 0 | B-A1 | -| record | set | n't been set . × | 0 | I-A1 | -| date | set | n't been set . × | 0 | I-A1 | -| has | set | n't been set . × | 0 | O | -| n't | set | n't been set . × | 1 | B-AM-NEG | -| been | set | n't been set . × | 1 | O | -| set | set | n't been set . × | 1 | B-V | -| . | set | n't been set . × | 1 | O | - - -除数据之外,我们同时提供了以下资源: - -| 文件名称 | 说明 | -|---|---| -| word_dict | 输入句子的词典,共计44068个词 | -| label_dict | 标记的词典,共计106个标记 | -| predicate_dict | 谓词的词典,共计3162个词 | -| emb | 一个训练好的词表,32维 | - -我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 - -获取词典,打印词典大小: - -```python -from __future__ import print_function - -import math, os -import numpy as np -import paddle -import paddle.v2.dataset.conll05 as conll05 -import paddle.fluid as fluid -import time - -with_gpu = os.getenv('WITH_GPU', '0') != '0' - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_dict_len = len(verb_dict) - -print('word_dict_len: ', word_dict_len) -print('label_dict_len: ', label_dict_len) -print('pred_dict_len: ', pred_dict_len) -``` - -## 模型配置说明 - -- 定义输入数据维度及模型超参数。 - -```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 - -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 - -embedding_name = 'emb' -``` - -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 - -- 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 - -```python -# 这里加载PaddlePaddle上版保存的二进制模型 -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. - return np.fromfile(f, dtype=np.float32).reshape(h, w) -``` - -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 - -```python -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - **ignored): - # 8 features - predicate_embedding = fluid.layers.embedding( - input=predicate, - size=[pred_dict_len, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr='vemb') - - mark_embedding = fluid.layers.embedding( - input=mark, - size=[mark_dict_len, mark_dim], - dtype='float32', - is_sparse=IS_SPARSE) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - # Since word vector lookup table is pre-trained, we won't update it this time. - # trainable being False prevents updating the lookup table during training. - emb_layers = [ - fluid.layers.embedding( - size=[word_dict_len, word_dim], - input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - # 8 LSTM units are trained through alternating left-to-right / right-to-left order - # denoted by the variable `reverse`. - hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') - for emb in emb_layers - ] - - hidden_0 = fluid.layers.sums(input=hidden_0_layers) - - lstm_0 = fluid.layers.dynamic_lstm( - input=hidden_0, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid') - - # stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - # In PaddlePaddle, state features and transition features of a CRF are implemented - # by a fully connected layer and a CRF layer seperately. The fully connected layer - # with linear activation learns the state features, here we use fluid.layers.sums - # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: - # fluid.layers.linear_chain_crf only - # learns the transition features, which is a cost layer and is the last layer of the network. - # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence - # as the cost by given the input sequence and it requires the true tag sequence - # as target in the learning process. - - for i in range(1, depth): - mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') - ]) - - lstm = fluid.layers.dynamic_lstm( - input=mix_hidden, - size=hidden_dim, - candidate_activation='relu', - gate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=((i % 2) == 1)) - - input_tmp = [mix_hidden, lstm] - - # 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射, - # 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征 - feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') - ]) - - return feature_out -``` - -## 训练模型 - -- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 - -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - -```python -def train(use_cuda, save_dirname=None, is_local=True): - # define network topology - - # 句子序列 - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词 - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下文5个特征 - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - - # 谓词上下区域标志 - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - # define network topology - feature_out = db_lstm(**locals()) - - # 标注序列 - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - - # 学习 CRF 的转移特征 - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) - - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - - sgd_optimizer.minimize(avg_cost) - - # The CRF decoding layer is used for evaluation and inference. - # It shares weights with CRF layer. The sharing of parameters among multiple layers - # is specified by using the same parameter name in these layers. If true tag sequence - # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error - # for each input token and sums the error over the entire sequence. - # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. - crf_decode = fluid.layers.crf_decoding( - input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=BATCH_SIZE) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target - ], - place=place) - exe = fluid.Executor(place) - - def train_loop(main_program): - exe.run(fluid.default_startup_program()) - embedding_param = fluid.global_scope().find_var( - embedding_name).get_tensor() - embedding_param.set( - load_parameter(conll05.get_embedding(), word_dict_len, word_dim), - place) - - start_time = time.time() - batch_id = 0 - for pass_id in xrange(PASS_NUM): - for data in train_data(): - cost = exe.run(main_program, - feed=feeder.feed(data), - fetch_list=[avg_cost]) - cost = cost[0] - - if batch_id % 10 == 0: - print("avg_cost: " + str(cost)) - if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) - # Set the threshold low to speed up the CI test - if float(cost) < 60.0: - if save_dirname is not None: - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) - return - - batch_id = batch_id + 1 - - train_loop(fluid.default_main_program()) -``` - - -## 应用模型 - -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 - -```python -def infer(use_cuda, save_dirname=None): - if save_dirname is None: - return - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - inference_scope = fluid.core.Scope() - with fluid.scope_guard(inference_scope): - # Use fluid.io.load_inference_model to obtain the inference program desc, - # the feed_target_names (the names of variables that will be fed - # data using feed operators), and the fetch_targets (variables that - # we want to obtain data from using fetch operators). - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of - # each word (base_shape) should be [1] since it is simply an index to - # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only - # one higher level structure (sequence of words, or sentence) than the basic - # element (word). Hence the LoDTensor will hold data for three sentences of - # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] - base_shape = [1] - # The range of random integers is [low, high] - word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=pred_dict_len - 1) - ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) - mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=mark_dict_len - 1) - - # Construct feed as a dictionary of {feed_target_name: feed_target_data} - # and results will contain a list of data corresponding to fetch_targets. - assert feed_target_names[0] == 'word_data' - assert feed_target_names[1] == 'verb_data' - assert feed_target_names[2] == 'ctx_n2_data' - assert feed_target_names[3] == 'ctx_n1_data' - assert feed_target_names[4] == 'ctx_0_data' - assert feed_target_names[5] == 'ctx_p1_data' - assert feed_target_names[6] == 'ctx_p2_data' - assert feed_target_names[7] == 'mark_data' - - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) - print(results[0].lod()) - np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) -``` - -整个程序的入口如下: - -```python -def main(use_cuda, is_local=True): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - - # Directory for saving the trained model - save_dirname = "label_semantic_roles.inference.model" - - train(use_cuda, save_dirname, is_local) - infer(use_cuda, save_dirname) - - -main(use_cuda=False) -``` - -## 总结 - -语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 - -## 参考文献 -1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. -2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013. -3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014. -5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289. -6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012. -7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330. -8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. -9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. -10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md b/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md deleted file mode 100644 index a27499c6ed8d1149c6d519006086febbcae943fa..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/learning_materials.md +++ /dev/null @@ -1,54 +0,0 @@ -# 学习资料 - -## 要读的第一本书 -基础理论习得的最直接来源就是书本。按机器学习理论、深度学习理论、编程语言三方面划分,这里推荐如下书籍辅助您。 - - -### 机器学习理论 - -在开启深度学习之前,您需要先行掌握机器学习的理论。深度学习是机器学习中的一个分支,两者内在的理论基础存在强关联。 -机器学习理论的书籍教材比较多,这里推荐一本易懂易学的书籍,可以重点关注神经网络部分。 - -书名:《机器学习》(周志华著,清华大学出版社,2016年版) - -### 深度学习理论 - -打好机器学习的理论功底后,您可以开始钻研深度学习的理论。通常深度学习理论会给人留下抽象难懂的印象,且和数学结合紧密。 -为了让您能够顺利入门,这里推荐一份易学易用的教材,无论深度学习理论还是数学理论即可一本搞定。 - -书名:《Deep Learning(深度学习)》(Goodfellow, Bengio, Courville合著,赵申剑、黎彧君、符天凡和李凯合译,人民邮电出版社,2017年版) -此书电子版在Github上已经开源,详情可参考此链接 [《深度学习》](https://github.com/exacity/deeplearningbook-chinese) - -### 编程语言 - -Python方向:这里推荐您学习Python,一方面各大主流深度学习框架的主力支撑编程语言均为Python;另一方面,对比其他语言,Python较为简单易学。 -Python的教材种类较多,这里推荐一本实操和理论性都兼顾的教材,只要完成书中52个习题,跑代码然后发现问题解决,就能逐步上手。 - -书名:《“笨办法”学Python》(Zed Shaw著,王巍巍译,人民邮电出版社,2014年11月版) - - -C++方向:C++语言在底层框架中使用较多,您逐步掌握开源框架的基本操作后,在更高阶的框架应用中会用到这个技能点。 -同前面提到的Python一样,学习C++时需要多上手操作。这里推荐迅速上手C++的书籍,不但能够学习功能和结构,还提供了解决方案的示例。 - -书名:《Essential C++》【美】李普曼(Lippman,S.B.)著,侯捷译,电子工业出版社2013年8月版 - - - -## 要看的视频公开课 - -在学习一门新技术的同时,除了看书,如果有老师面对面教授,可以更快更好的学会知识。相比于线下授课,视频公开课能够在省钱省力的同时,达到易学易掌握的效果。 -目前深度学习的课程多是公开免费的,通过学习您可以更轻松的理解深度学习中的抽象理论,并在实操方面不绕弯路。 -综合课程生动性、可操作性、紧凑性、连续性这些特点,这里推荐如下课程,同步附上网址,便于您查找学习。 - -### 理论知识详解视频课 -[机器学习](http://open.163.com/special/opencourse/machinelearning.html) 斯坦福大学教授吴恩达公开课程,包含相关算法的详细讲解。 - -[AI技术](https://ai.baidu.com/paddlepaddle/player?id=13) 百度推出的“AI核心技术掌握”课程,每节课在20-30分钟左右,从AI技术到深度学习进行全面细致的解读。 - -[深度学习](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) 台湾李宏毅教授的在线课程,其中是英文课程,会结合国外的科研成果,但也适合新手入门和理解深度学习。 - -[编程语言](https://ai.baidu.com/paddlepaddle/openCourses) Python操作课程,从基础到进阶操作都提供详细说明,每节课时长20分钟左右。 - -### PaddlePaddle实操视频课 -掌握好理论基础,具备编程能力后,您可以开始使用PaddlePaddle Fluid进行实操,从初阶开始学习,向着中高阶努力。 -目前已有PaddlePaddle官方视频公开课在官网呈现,内含PaddlePaddle实战、PaddlePaddle应用场景和机器学习模型讲解课程,帮助开发者从零开始使用PaddlePaddle,从简单场景逐步过渡到工业级应用。[点击这里](http://ai.baidu.com/paddlepaddle/openCourses)您即可开始视频课的学习之旅。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore deleted file mode 100644 index 6129b9e8645010fcb8372d9dc3dbb568dfa80907..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -data/wmt14 -data/pre-wmt14 -pretrained/wmt14_model -gen.log -gen_result -train.log -dataprovider_copy_1.py -*.pyc -multi-bleu.perl diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md deleted file mode 100644 index 6e5f77fec8a894c390ced8c93ee344fd8d27370e..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md +++ /dev/null @@ -1,472 +0,0 @@ -# 机器翻译 - -本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 - -早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 - -为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 - -近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 -
-
-图1. 基于神经网络的机器翻译系统 -
- -本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 - -## 效果展示 - -以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子: -```text -这些 是 希望 的 曙光 和 解脱 的 迹象 . -``` -如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下: -```text -0 -5.36816 These are signs of hope and relief . -1 -6.23177 These are the light of hope and relief . -2 -7.7914 These are the light of hope and the relief of hope . -``` - -- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。 - -- 另外有两个特殊标志:``表示句子的结尾,``表示未登录词(unknown word),即未在训练字典中出现的词。 - -## 模型概览 - -本节依次介绍双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架以及柱搜索(beam search)算法。 - -### 双向循环神经网络 - -我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 - -具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。 - - -
-
-图2. 按时间步展开的双向循环神经网络 -
- -### 编码器-解码器框架 - -编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 -![encoder_decoder](./image/encoder_decoder.png) -
-
-图3. 编码器-解码器框架 -
- - -#### 编码器 - -编码阶段分为三步: - -1. one-hot vector表示:将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 - -2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为`$C\epsilon R^{K\times \left | V \right |}$`,用`$s_i=Cw_i$`表示第`$i$`个词的词向量,`$K$`为向量维度。 - -3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。 - -第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。 -
-
-图4. 使用双向GRU的编码器 -
- -#### 解码器 - -机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: -1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: -$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ -其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 - -2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: -$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ -其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。 - -3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。 - -4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 - -机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 - - -### 柱搜索算法 - -柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 - -柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 - -使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: -1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。 - -2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。 - -3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。 - -4. 重复步骤1~3,直到获得句子结束标记``或超过句子的最大生成长度为止。 - -注意:`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。 - -## 数据介绍 - -本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。 - -### 数据预处理 - -我们的预处理流程包括两步: - -- 将每个源语言到目标语言的平行语料库文件合并为一个文件: - -- 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。 - -- `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接,用'\t'分隔。 - -- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号``(序列的开始)、``(序列的结束)和``(未登录词)。 - -### 示例数据 - -因为完整的数据集数据量较大,为了验证训练流程,PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。 - -该数据集有193319条训练数据,6003条测试数据,词典长度为30000。因为数据规模限制,使用该数据集训练出来的模型效果无法保证。 - -## 模型配置说明 - -下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 - -```python -from __future__ import print_function -import contextlib - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.framework as framework -import paddle.fluid.layers as pd -from paddle.fluid.executor import Executor -from functools import partial -import os - -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -hidden_dim = 32 -word_dim = 16 -batch_size = 2 -max_length = 8 -topk_size = 50 -beam_size = 2 - -decoder_size = hidden_dim -``` - -然后如下实现编码器框架: - - ```python - def encoder(is_sparse): - src_word_id = pd.data( - name="src_word_id", shape=[1], dtype='int64', lod_level=1) - src_embedding = pd.embedding( - input=src_word_id, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=is_sparse, - param_attr=fluid.ParamAttr(name='vemb')) - - fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = pd.sequence_last_step(input=lstm_hidden0) - return encoder_out - ``` - -再实现训练模式下的解码器: - -```python - def train_decoder(context, is_sparse): - trg_language_word = pd.data( - name="target_language_word", shape=[1], dtype='int64', lod_level=1) - trg_embedding = pd.embedding( - input=trg_language_word, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=is_sparse, - param_attr=fluid.ParamAttr(name='vemb')) - - rnn = pd.DynamicRNN() - with rnn.block(): - current_word = rnn.step_input(trg_embedding) - pre_state = rnn.memory(init=context) - current_state = pd.fc(input=[current_word, pre_state], - size=decoder_size, - act='tanh') - - current_score = pd.fc(input=current_state, - size=target_dict_dim, - act='softmax') - rnn.update_memory(pre_state, current_state) - rnn.output(current_score) - - return rnn() -``` - -实现推测模式下的解码器: - -```python -def decode(context, is_sparse): - init_state = context - array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) - - # fill the first element with init_state - state_array = pd.create_array('float32') - pd.array_write(init_state, array=state_array, i=counter) - - # ids, scores as memory - ids_array = pd.create_array('int64') - scores_array = pd.create_array('float32') - - init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) - init_scores = pd.data( - name="init_scores", shape=[1], dtype="float32", lod_level=2) - - pd.array_write(init_ids, array=ids_array, i=counter) - pd.array_write(init_scores, array=scores_array, i=counter) - - cond = pd.less_than(x=counter, y=array_len) - - while_op = pd.While(cond=cond) - with while_op.block(): - pre_ids = pd.array_read(array=ids_array, i=counter) - pre_state = pd.array_read(array=state_array, i=counter) - pre_score = pd.array_read(array=scores_array, i=counter) - - # expand the lod of pre_state to be the same with pre_score - pre_state_expanded = pd.sequence_expand(pre_state, pre_score) - - pre_ids_emb = pd.embedding( - input=pre_ids, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=is_sparse) - - # use rnn unit to update rnn - current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], - size=decoder_size, - act='tanh') - current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) - # use score to do beam search - current_score = pd.fc(input=current_state_with_lod, - size=target_dict_dim, - act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=beam_size) - # calculate accumulated scores after topk to reduce computation cost - accu_scores = pd.elementwise_add( - x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) - selected_ids, selected_scores = pd.beam_search( - pre_ids, - pre_score, - topk_indices, - accu_scores, - beam_size, - end_id=10, - level=0) - - pd.increment(x=counter, value=1, in_place=True) - - # update the memories - pd.array_write(current_state, array=state_array, i=counter) - pd.array_write(selected_ids, array=ids_array, i=counter) - pd.array_write(selected_scores, array=scores_array, i=counter) - - # update the break condition: up to the max length or all candidates of - # source sentences have ended. - length_cond = pd.less_than(x=counter, y=array_len) - finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) - pd.logical_and(x=length_cond, y=finish_cond, out=cond) - - translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) - - return translation_ids, translation_scores -``` - -进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 - -```python -def train_program(is_sparse): - context = encoder(is_sparse) - rnn_out = train_decoder(context, is_sparse) - label = pd.data( - name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) - cost = pd.cross_entropy(input=rnn_out, label=label) - avg_cost = pd.mean(cost) - return avg_cost - - -def optimizer_func(): - return fluid.optimizer.Adagrad( - learning_rate=1e-4, - regularization=fluid.regularizer.L2DecayRegularizer( - regularization_coeff=0.1)) -``` - -## 训练模型 - -### 定义训练环境 -定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.wmt.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 - -```python -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=batch_size) -``` - -### 构造训练器(trainer) -训练器需要一个训练程序和一个训练优化函数。 - -```python -is_sparse = False -trainer = fluid.Trainer( - train_func=partial(train_program, is_sparse), - place=place, - optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。 - -```python -feed_order = [ - 'src_word_id', 'target_language_word', 'target_language_next_word' - ] -``` - -### 事件处理器 -回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - if event.step % 10 == 0: - print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) - - if event.step == 20: - trainer.stop() -``` - -### 开始训练 -最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 - -```python -EPOCH_NUM = 1 - -trainer.train( - reader=train_reader, - num_epochs=EPOCH_NUM, - event_handler=event_handler, - feed_order=feed_order) -``` - -## 应用模型 - -### 定义解码部分 - -使用上面定义的 `encoder` 和 `decoder` 函数来推测翻译后的对应id和分数. - -```python -context = encoder(is_sparse) -translation_ids, translation_scores = decode(context, is_sparse) -``` - -### 定义数据 - -我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中,我们用`wmt14.test`数据中的第一个记录来做推测,最后我们用"源字典"和"目标字典"来列印对应的句子结果。 - -```python -init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') -init_scores_data = np.array( - [1. for _ in range(batch_size)], dtype='float32') -init_ids_data = init_ids_data.reshape((batch_size, 1)) -init_scores_data = init_scores_data.reshape((batch_size, 1)) -init_lod = [1] * batch_size -init_lod = [init_lod, init_lod] - -init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) -init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) - -test_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.test(dict_size), buf_size=1000), - batch_size=batch_size) - -feed_order = ['src_word_id'] -feed_list = [ - framework.default_main_program().global_block().var(var_name) - for var_name in feed_order -] -feeder = fluid.DataFeeder(feed_list, place) - -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -``` - -### 测试 -现在我们可以进行预测了。我们要在`feed_order`提供对应参数,放在`executor`上运行以取得id和分数结果 - -```python -exe = Executor(place) -exe.run(framework.default_startup_program()) - -for data in test_data(): - feed_data = map(lambda x: [x[0]], data) - feed_dict = feeder.feed(feed_data) - feed_dict['init_ids'] = init_ids - feed_dict['init_scores'] = init_scores - - results = exe.run( - framework.default_main_program(), - feed=feed_dict, - fetch_list=[translation_ids, translation_scores], - return_numpy=False) - - result_ids = np.array(results[0]) - result_scores = np.array(results[1]) - - print("Original sentence:") - print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]])) - print("Translated score and sentence:") - for i in xrange(beam_size): - start_pos = result_ids_lod[1][i] + 1 - end_pos = result_ids_lod[1][i+1] - print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1], - " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]]))) - - break -``` - -## 总结 - -端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。 - -## 参考文献 - -1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009. -2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734. -3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015. -5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore deleted file mode 100644 index f23901aeb3a9e7cd12611fc556742670d04a9bb5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.idea -.ipynb_checkpoints diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md deleted file mode 100644 index 4b79e62f74e587fcd939d9f9e911af80992ea6a3..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md +++ /dev/null @@ -1,537 +0,0 @@ -# 个性化推荐 - -本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/176.html)。 - -## 背景介绍 - -在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 - -个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 - -传统的推荐系统方法主要有: - -- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 -- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 -- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 - -其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 - -深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 - -## 效果展示 - -我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[0,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 - -``` -Input movie_id: 1962 -Input user_id: 1 -Prediction Score is 4.25 -``` - -## 模型概览 - -本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 - -### YouTube的深度神经网络推荐系统 - -YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: - -

-
-图1. YouTube 推荐系统结构 -

- -#### 候选生成网络(Candidate Generation Network) - -候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个Youtube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 - -首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 - -

-
-图2. 候选生成网络结构 -

- -对于一个用户$U$,预测此刻用户要观看的视频$\omega$为视频$i$的概率公式为: - -$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ - -其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。 - -考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。 - -#### 排序网络(Ranking Network) -排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到底排序后将分数较高的一些视频返回给用户。 - -### 融合推荐模型 -本节会使卷积神经网络(Convolutional Neural Networks)来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。 - -#### 文本卷积神经网络(CNN) - -卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。 - -卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解: - -

-
-图3. 卷积神经网络文本分类模型 -

- -假设待处理句子的长度为$n$,其中第$i$个词的词向量(word embedding)为$x_i\in\mathbb{R}^k$,$k$为维度大小。 - -首先,进行词向量的拼接操作:将每$h$个词拼接起来形成一个大小为$h$的词窗口,记为$x_{i:i+h-1}$,它表示词序列$x_{i},x_{i+1},\ldots,x_{i+h-1}$的拼接,其中,$i$表示词窗口中第一个词在整个句子中的位置,取值范围从$1$到$n-h+1$,$x_{i:i+h-1}\in\mathbb{R}^{hk}$。 - -其次,进行卷积操作:把卷积核(kernel)$w\in\mathbb{R}^{hk}$应用于包含$h$个词的窗口$x_{i:i+h-1}$,得到特征$c_i=f(w\cdot x_{i:i+h-1}+b)$,其中$b\in\mathbb{R}$为偏置项(bias),$f$为非线性激活函数,如$sigmoid$。将卷积核应用于句子中所有的词窗口${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$,产生一个特征图(feature map): - -$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ - -接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征$\hat c$,它是特征图中所有元素的最大值: - -$$\hat c=max(c)$$ - -#### 模型概览 - -在融合推荐模型的电影推荐系统中: - -1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: - - - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 - - - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 - -2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 - -3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 - -4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 - -

- -
-图4. 融合推荐模型 -

- -## 数据准备 - -### 数据介绍与下载 - -我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 - -Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens` - - -```python -import paddle -movie_info = paddle.dataset.movielens.movie_info() -print movie_info.values()[0] -``` - - -```python -# Run this block to show dataset's documentation -# help(paddle.dataset.movielens) -``` - -在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。 - -例如,其中某一个电影特征为: - - -```python -movie_info = paddle.dataset.movielens.movie_info() -print movie_info.values()[0] -``` - - - - -这表示,电影的id是1,标题是《Toy Story》,该电影被分为到三个类别中。这三个类别是动画,儿童,喜剧。 - - -```python -user_info = paddle.dataset.movielens.user_info() -print user_info.values()[0] -``` - - - - -这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。 - - -其中,年龄使用下列分布 - -* 1: "Under 18" -* 18: "18-24" -* 25: "25-34" -* 35: "35-44" -* 45: "45-49" -* 50: "50-55" -* 56: "56+" - -职业是从下面几种选项里面选则得出: - -* 0: "other" or not specified -* 1: "academic/educator" -* 2: "artist" -* 3: "clerical/admin" -* 4: "college/grad student" -* 5: "customer service" -* 6: "doctor/health care" -* 7: "executive/managerial" -* 8: "farmer" -* 9: "homemaker" -* 10: "K-12 student" -* 11: "lawyer" -* 12: "programmer" -* 13: "retired" -* 14: "sales/marketing" -* 15: "scientist" -* 16: "self-employed" -* 17: "technician/engineer" -* 18: "tradesman/craftsman" -* 19: "unemployed" -* 20: "writer" - -而对于每一条训练/测试数据,均为 <用户特征> + <电影特征> + 评分。 - -例如,我们获得第一条训练数据: - - -```python -train_set_creator = paddle.dataset.movielens.train() -train_sample = next(train_set_creator()) -uid = train_sample[0] -mov_id = train_sample[len(user_info[uid].value())] -print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) -``` - - User rates Movie with Score [5.0] - - -即用户1对电影1193的评价为5分。 - -## 模型配置说明 - -下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 - - -```python -from __future__ import print_function -import math -import sys -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.nets as nets - -IS_SPARSE = True -USE_GPU = False -BATCH_SIZE = 256 -``` - -然后为我们的用户特征综合模型定义模型配置 - -```python -def get_usr_combined_features(): - - USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - - uid = layers.data(name='user_id', shape=[1], dtype='int64') - - usr_emb = layers.embedding( - input=uid, - dtype='float32', - size=[USR_DICT_SIZE, 32], - param_attr='user_table', - is_sparse=IS_SPARSE) - - usr_fc = layers.fc(input=usr_emb, size=32) - - USR_GENDER_DICT_SIZE = 2 - - usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') - - usr_gender_emb = layers.embedding( - input=usr_gender_id, - size=[USR_GENDER_DICT_SIZE, 16], - param_attr='gender_table', - is_sparse=IS_SPARSE) - - usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) - - USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) - usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") - - usr_age_emb = layers.embedding( - input=usr_age_id, - size=[USR_AGE_DICT_SIZE, 16], - is_sparse=IS_SPARSE, - param_attr='age_table') - - usr_age_fc = layers.fc(input=usr_age_emb, size=16) - - USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 - usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") - - usr_job_emb = layers.embedding( - input=usr_job_id, - size=[USR_JOB_DICT_SIZE, 16], - param_attr='job_table', - is_sparse=IS_SPARSE) - - usr_job_fc = layers.fc(input=usr_job_emb, size=16) - - concat_embed = layers.concat( - input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - - usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - - return usr_combined_features -``` - -如上述代码所示,对于每个用户,我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。 - -然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。 - -进而,我们对每一个电影特征做类似的变换,网络配置为: - - -```python -def get_mov_combined_features(): - - MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - - mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') - - mov_emb = layers.embedding( - input=mov_id, - dtype='float32', - size=[MOV_DICT_SIZE, 32], - param_attr='movie_table', - is_sparse=IS_SPARSE) - - mov_fc = layers.fc(input=mov_emb, size=32) - - CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - - category_id = layers.data( - name='category_id', shape=[1], dtype='int64', lod_level=1) - - mov_categories_emb = layers.embedding( - input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) - - mov_categories_hidden = layers.sequence_pool( - input=mov_categories_emb, pool_type="sum") - - MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - - mov_title_id = layers.data( - name='movie_title', shape=[1], dtype='int64', lod_level=1) - - mov_title_emb = layers.embedding( - input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) - - mov_title_conv = nets.sequence_conv_pool( - input=mov_title_emb, - num_filters=32, - filter_size=3, - act="tanh", - pool_type="sum") - - concat_embed = layers.concat( - input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) - - mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - - return mov_combined_features -``` - -电影标题名称(title)是一个序列的整数,整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层,这个层会在时间维度上使用卷积和池化。因为如此,所以输出会是固定长度,尽管输入的序列长度各不相同。 - -最后,我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。 - -```python -def inference_program(): - usr_combined_features = get_usr_combined_features() - mov_combined_features = get_mov_combined_features() - - inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) - scale_infer = layers.scale(x=inference, scale=5.0) - - return scale_infer -``` - -进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 - -```python -def train_program(): - - scale_infer = inference_program() - - label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(square_cost) - - return [avg_cost, scale_infer] - - -def optimizer_func(): - return fluid.optimizer.SGD(learning_rate=0.2) -``` - - -## 训练模型 - -### 定义训练环境 -定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 - -```python -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.movielens.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -test_reader = paddle.batch( - paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) -``` - -### 构造训练器(trainer) -训练器需要一个训练程序和一个训练优化函数。 - -```python -trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。 - -```python -feed_order = [ - 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', - 'movie_title', 'score' -] -``` - -### 事件处理器 -回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -# Specify the directory path to save the parameters -params_dirname = "recommender_system.inference.model" - -from paddle.v2.plot import Ploter -test_title = "Test cost" -plot_cost = Ploter(test_title) - - -def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - avg_cost_set = trainer.test( - reader=test_reader, feed_order=feed_order) - - # get avg cost - avg_cost = np.array(avg_cost_set).mean() - - plot_cost.append(test_title, event.step, avg_cost_set[0]) - plot_cost.plot() - - print("avg_cost: %s" % avg_cost) - print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, - float(avg_cost))) - - if event.step == 20: # Adjust this number for accuracy - trainer.save_params(params_dirname) - trainer.stop() -``` - -### 开始训练 -最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 - -```python -trainer.train( - num_epochs=1, - event_handler=event_handler, - reader=train_reader, - feed_order=feed_order) -``` - -## 应用模型 - -### 构建预测器 -传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 - -```python -inferencer = fluid.Inferencer( - inference_program, param_path=params_dirname, place=place) -``` - -### 生成测试用输入数据 -使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列,每个元素是一个索引号的序列。`lod`是细节层次的信息,对应于`data`。比如,data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列,长度分别是3和2。于是相应地 lod = [[3, 2]],它表明其包含一层细节信息,意味着 `data` 有两个序列,长度分别是3和2。 - -在这个预测例子中,我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分 - -```python -infer_movie_id = 783 -infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title -user_id = fluid.create_lod_tensor([[1]], [[1]], place) -gender_id = fluid.create_lod_tensor([[1]], [[1]], place) -age_id = fluid.create_lod_tensor([[0]], [[1]], place) -job_id = fluid.create_lod_tensor([[10]], [[1]], place) -movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame -category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical -movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]], - place) # 'hunchback','of','notre','dame','the' -``` - -### 测试 -现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。 - - -```python -results = inferencer.infer( - { - 'user_id': user_id, - 'gender_id': gender_id, - 'age_id': age_id, - 'job_id': job_id, - 'movie_id': movie_id, - 'category_id': category_id, - 'movie_title': movie_title - }, - return_numpy=False) - -predict_rating = np.array(results[0]) -print("Predict Rating of user id 1 on movie \"" + infer_movie_name + "\" is " + str(predict_rating[0][0])) -print("Actual Rating of user id 1 on movie \"" + infer_movie_name + "\" is 4.") - -``` - -## 总结 - -本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 - -## 参考文献 - -1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. -2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. -3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. -4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. -5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA -6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). -7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. - - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore deleted file mode 100644 index 667762d327cb160376a4119fa9df9db41b6443b2..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -data/aclImdb -data/imdb -data/pre-imdb -data/mosesdecoder-master -*.log -model_output -dataprovider_copy_1.py -model.list -*.pyc -.DS_Store diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md deleted file mode 100644 index 8477cf32146c33947ced447c8bdd287a3e1e71f5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md +++ /dev/null @@ -1,358 +0,0 @@ -# 情感分析 - -本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/177.html)。 - -## 背景介绍 - -在自然语言处理中,情感分析一般是指判断一段文本所表达的情绪状态。其中,一段文本可以是一个句子,一个段落或一个文档。情绪状态可以是两类,如(正面,负面),(高兴,悲伤);也可以是三类,如(积极,消极,中性)等等。情感分析的应用场景十分广泛,如把用户在购物网站(亚马逊、天猫、淘宝等)、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论;或为了分析用户对于某一产品的整体使用感受,抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子: - -| 电影评论 | 类别 | -| -------- | ----- | -| 在冯小刚这几年的电影里,算最好的一部的了| 正面 | -| 很不好看,好像一个地方台的电视剧 | 负面 | -| 圆方镜头全程炫技,色调背景美则美矣,但剧情拖沓,口音不伦不类,一直努力却始终无法入戏| 负面| -|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉,看得实在太舒服了。。|正面| - -

表格 1 电影评论情感分析

- -在自然语言处理中,情感分析属于典型的**文本分类**问题,即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前,主流的文本表示方法为词袋模型BOW(bag of words),话题模型等等;分类方法有SVM(support vector machine), LR(logistic regression)等等。 - -对于一段文本,BOW表示会忽略其词顺序、语法和句法,将这段文本仅仅看做是一个词集合,因此BOW方法并不能充分表示文本的语义信息。例如,句子“这部电影糟糕透了”和“一个乏味,空洞,没有内涵的作品”在情感分析中具有很高的语义相似度,但是它们的BOW表示的相似度为0。又如,句子“一个空洞,没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高,但实际上它们的意思很不一样。 - -本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷,它在考虑词顺序的基础上把文本映射到低维度的语义空间,并且以端对端(end to end)的方式进行文本表示及分类,其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。 - -## 模型概览 -本章所使用的文本表示模型为卷积神经网络(Convolutional Neural Networks)和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。 - -### 文本卷积神经网络简介(CNN) - -我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程,这里进行一个简单的回顾。 - -对卷积神经网络来说,首先使用卷积处理输入的词向量序列,产生一个特征图(feature map),对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征,最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵,这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子,[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作。 - -对于一般的短文本分类问题,上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示,可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。 - -### 循环神经网络(RNN) - -循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 - -

-
-图1. 循环神经网络按时间展开的示意图 -

- -循环神经网络按时间展开后如图1所示:在第$t$时刻,网络读入第$t$个输入$x_t$(向量表示)及前一时刻隐层的状态值$h_{t-1}$(向量表示,$h_0$一般初始化为$0$向量),计算得出本时刻隐层的状态值$h_t$,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为$f$,则其公式可表示为: - -$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$ - -其中$W_{xh}$是输入到隐层的矩阵参数,$W_{hh}$是隐层到隐层的矩阵参数,$b_h$为隐层的偏置向量(bias)参数,$\sigma$为$sigmoid$函数。 - -在处理自然语言时,一般会先将词(one-hot表示)映射为其词向量(word embedding)表示,然后再作为循环神经网络每一时刻的输入$x_t$。此外,可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如,可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层(deep or stacked)循环神经网络,或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。 - -### 长短期记忆网络(LSTM) - -对于较长的序列数据,循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题,Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。 - -相比于简单的循环神经网络,LSTM增加了记忆单元$c$、输入门$i$、遗忘门$f$及输出门$o$。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为$F$,则其公式为: - -$$ h_t=F(x_t,h_{t-1})$$ - -$F$由下列公式组合而成\[[7](#参考文献)\]: -$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$ -$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$ -$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$ -$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$ -$$ h_t = o_t\odot tanh(c_t) $$ -其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示: - -

-
-图2. 时刻$t$的LSTM [7] -

- -LSTM通过给简单的循环神经网络增加记忆及控制门的方式,增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\],其设计更为简洁一些。**这些改进虽然各有不同,但是它们的宏观描述却与简单的循环神经网络一样(如图2所示),即隐状态依据当前输入及前一时刻的隐状态来改变,不断地循环这一过程直至输入处理完毕:** - -$$ h_t=Recrurent(x_t,h_{t-1})$$ - -其中,$Recrurent$可以表示简单的循环神经网络、GRU或LSTM。 - -### 栈式双向LSTM(Stacked Bidirectional LSTM) - -对于正常顺序的循环神经网络,$h_t$包含了$t$时刻之前的输入信息,也就是上文信息。同样,为了得到下文信息,我们可以使用反方向(将输入逆序处理)的循环神经网络。结合构建深层循环神经网络的方法(深层神经网络往往能得到更抽象和高级的特征表示),我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\],来对时序数据进行建模。 - -如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 - -

-
-图3. 栈式双向LSTM用于文本分类 -

- - -## 数据集介绍 - -我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中,负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。 -```text -aclImdb -|- test - |-- neg - |-- pos -|- train - |-- neg - |-- pos -``` -Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取,并提供了读取字典、训练数据、测试数据等API。 - -## 配置模型 - -在该示例中,我们实现了两种文本分类算法,分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络,以及[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。我们首先引入要用到的库和定义全局变量: - -```python -from __future__ import print_function -import paddle -import paddle.fluid as fluid -from functools import partial -import numpy as np - -CLASS_DIM = 2 -EMB_DIM = 128 -HID_DIM = 512 -STACKED_NUM = 3 -BATCH_SIZE = 128 -USE_GPU = False -``` - - -### 文本卷积神经网络 -我们构建神经网络`convolution_net`,示例代码如下。 -需要注意的是:`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。 - -```python -def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): - emb = fluid.layers.embedding( - input=data, size=[input_dim, emb_dim], is_sparse=True) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=3, - act="tanh", - pool_type="sqrt") - conv_4 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=4, - act="tanh", - pool_type="sqrt") - prediction = fluid.layers.fc( - input=[conv_3, conv_4], size=class_dim, act="softmax") - return prediction -``` - -网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 - - - -### 栈式双向LSTM - -栈式双向神经网络`stacked_lstm_net`的代码片段如下: - -```python -def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): - - emb = fluid.layers.embedding( - input=data, size=[input_dim, emb_dim], is_sparse=True) - - fc1 = fluid.layers.fc(input=emb, size=hid_dim) - lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) - - inputs = [fc1, lstm1] - - for i in range(2, stacked_num + 1): - fc = fluid.layers.fc(input=inputs, size=hid_dim) - lstm, cell = fluid.layers.dynamic_lstm( - input=fc, size=hid_dim, is_reverse=(i % 2) == 0) - inputs = [fc, lstm] - - fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') - lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - - prediction = fluid.layers.fc( - input=[fc_last, lstm_last], size=class_dim, act='softmax') - return prediction -``` -以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。`paddle.activation.Softmax`函数用来计算分类属于某个类别的概率。 - -重申一下,此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。 - -接下来我们定义预测程序(`inference_program`)。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。 - -```python -def inference_program(word_dict): - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - - dict_dim = len(word_dict) - net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) - # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM) - return net -``` - -我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。 - -因为是有监督的学习,训练集的标签也在`paddle.layer.data`中定义了。在训练过程中,交叉熵用来在`paddle.layer.classification_cost`中作为损失函数。 - -在测试过程中,分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。 - -```python -def train_program(word_dict): - prediction = inference_program(word_dict) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) - accuracy = fluid.layers.accuracy(input=prediction, label=label) - return [avg_cost, accuracy] - - -def optimizer_func(): - return fluid.optimizer.Adagrad(learning_rate=0.002) -``` - -## 训练模型 - -### 定义训练环境 - -定义您的训练是在CPU上还是在GPU上: - - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 - -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 每次会在乱序化后提供一个大小为BATCH_SIZE的数据,乱序化的大小为缓存大小buf_size。 - -注意:读取IMDB的数据可能会花费几分钟的时间,请耐心等待。 - -```python -print("Loading IMDB word dict....") -word_dict = paddle.dataset.imdb.word_dict() - -print ("Reading training data....") -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=25000), - batch_size=BATCH_SIZE) -``` - -### 构造训练器(trainer) -训练器需要一个训练程序和一个训练优化函数。 - -```python -trainer = fluid.Trainer( - train_func=partial(train_program, word_dict), - place=place, - optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`imdb.train`产生的第一列的数据对应的是`words`这个特征。 - -```python -feed_order = ['words', 'label'] -``` - -### 事件处理器 - -回调函数event_handler在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -# Specify the directory path to save the parameters -params_dirname = "understand_sentiment_conv.inference.model" - -def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - print("Step {0}, Epoch {1} Metrics {2}".format( - event.step, event.epoch, map(np.array, event.metrics))) - - if event.step == 10: - trainer.save_params(params_dirname) - trainer.stop() -``` - -### 开始训练 - -最后,我们传入训练循环数(num_epoch)和一些别的参数,调用 trainer.train 来开始训练。 - -```python -trainer.train( - num_epochs=1, - event_handler=event_handler, - reader=train_reader, - feed_order=feed_order) -``` - -## 应用模型 - -### 构建预测器 - -传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 - -```python -inferencer = fluid.Inferencer( - infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) -``` - -### 生成测试用输入数据 - -为了进行预测,我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词,则设为`unknown`。 -然后我们用`create_lod_tensor`来创建细节层次的张量。 - -```python -reviews_str = [ - 'read the book forget the movie', 'this is a great movie', 'this is very bad' -] -reviews = [c.split() for c in reviews_str] - -UNK = word_dict[''] -lod = [] -for c in reviews: - lod.append([word_dict.get(words, UNK) for words in c]) - -base_shape = [[len(c) for c in lod]] - -tensor_words = fluid.create_lod_tensor(lod, base_shape, place) -``` - -## 应用模型 - -现在我们可以对每一条评论进行正面或者负面的预测啦。 - -```python -results = inferencer.infer({'words': tensor_words}) - -for i, r in enumerate(results[0]): - print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") - -``` - - -## 总结 - -本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。 - - -## 参考文献 -1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. -2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. -3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016. -4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449. -5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780. -6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166. -7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013. -8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. -9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore deleted file mode 100644 index a620e0279c310d213d4e6d8e99e666962c11e352..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -data/train.list -data/test.list -data/simple-examples* diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md deleted file mode 100644 index 904d99fe2ffc9ead69a86c9763568a5c098348d5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md +++ /dev/null @@ -1,446 +0,0 @@ - -# 词向量 - -本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/175.html)。 - -## 背景介绍 - -本章我们介绍词的向量表征,也称为word embedding。词向量是自然语言处理中常见的一个操作,是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。 - -在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 -在这种方式里,每个词被表示成一个实数向量(one-hot vector),其长度为字典大小,每个维度对应一个字典里的每个词,除了这个词对应维度上的值是1,其他元素都是0。 - -One-hot vector虽然自然,但是用处有限。比如,在互联网广告系统里,如果用户输入的query是“母亲节”,而有一个广告的关键词是“康乃馨”。虽然按照常理,我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨;但是这两个词对应的one-hot vectors之间的距离度量,无论是欧氏距离还是余弦相似度(cosine similarity),由于其向量正交,都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是:每个词本身的信息量都太小。所以,仅仅给定两个词,不足以让我们准确判别它们是否相关。要想精确计算相关性,我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。 - -在机器学习领域里,各种“知识”被各种模型表示,词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量(embedding vector),如$embedding(母亲节) = [0.3, 4.2, -1.5, ...], embedding(康乃馨) = [0.2, 5.6, -2.3, ...]$。在这个映射到的实数向量表示中,希望两个语义(或用法)上相似的词对应的词向量“更像”,这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。 - -词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前,传统做法是统计一个词语的共生矩阵$X$。$X$是一个$|V| \times |V|$ 大小的矩阵,$X_{ij}$表示在所有语料中,词汇表`V`(vocabulary)中第i个词和第j个词同时出现的词数,$|V|$为词汇表的大小。对$X$做矩阵分解(如奇异值分解,Singular Value Decomposition \[[5](#参考文献)\]),得到的$U$即视为所有词的词向量: - -$$X = USV^T$$ - -但这样的传统做法有很多问题: - -1) 由于很多词没有出现,导致矩阵极其稀疏,因此需要对词频做额外处理来达到好的矩阵分解效果; - -2) 矩阵非常大,维度太高(通常达到$10^6 \times 10^6$的数量级); - -3) 需要手动去掉停用词(如although, a,...),不然这些频繁出现的词也会影响矩阵分解的效果。 - -基于神经网络的模型不需要计算存储一个在全语料上统计的大表,而是通过学习语义信息得到词向量,因此能很好地解决以上问题。在本章里,我们将展示基于神经网络训练词向量的细节,以及如何用PaddlePaddle训练一个词向量模型。 - - -## 效果展示 - -本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。 - -

-
- 图1. 词向量的二维投影 -

- -另一方面,我们知道两个向量的余弦值在$[-1,1]$的区间内:两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0,两个方向完全相反的向量余弦值为-1,即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度: - -``` - -please input two words: big huge -similarity: 0.899180685161 - -please input two words: from company -similarity: -0.0997506977351 - -``` - -以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。 - - -## 模型概览 - -在这里我们介绍三个训练词向量的模型:N-gram模型,CBOW模型和Skip-gram模型,它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型,我们会先介绍语言模型的概念,并在之后的[训练模型](#训练模型)中,带大家用PaddlePaddle实现它。而后两个模型,是近年来最有名的神经元词向量模型,由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\],虽然它们很浅很简单,但训练效果很好。 - -### 语言模型 - -在介绍词向量模型之前,我们先来引入一个概念:语言模型。 -语言模型旨在为语句的联合概率函数$P(w_1, ..., w_T)$建模, 其中$w_i$表示句子中的第i个词。语言模型的目标是,希望模型对有意义的句子赋予大概率,对没意义的句子赋予小概率。 -这样的模型可以应用于很多领域,如机器翻译、语音识别、信息检索、词性标注、手写识别等,它们都希望能得到一个连续序列的概率。 以信息检索为例,当你在搜索“how long is a football bame”时(bame是一个医学名词),搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低,而与bame近似的,可能引起错误的词中,game会使该句生成的概率最大。 - -对语言模型的目标概率$P(w_1, ..., w_T)$,如果假设文本中每个词都是相互独立的,则整句话的联合概率可以表示为其中所有词语条件概率的乘积,即: - -$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$ - -然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型: - -$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$ - - - -### N-gram neural model - -在计算语言学中,n-gram是一种重要的文本表示方法,表示一个文本中连续的n个项。基于具体的应用场景,每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法,用n-gram训练语言模型时,一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。 - -Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型(Neural Network Language Model,NNLM)通过一个线性映射和一个非线性隐层连接,同时学习了语言模型和词向量,即通过学习大量语料得到词语的向量表达,通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难(curse of dimensionality),即训练和测试数据不同导致的模型不准。注意:由于“神经概率语言模型”说法较为泛泛,我们在这里不用其NNLM的本名,考虑到其具体做法,本文中称该模型为N-gram neural model。 - -我们在上文中已经讲到用条件概率建模语言模型,即一句话中第$t$个词的概率和该句话的前$t-1$个词相关。可实际上越远的词语其实对该词的影响越小,那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响,则有: - -$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$ - -给定一些真实语料,这些语料中都是有意义的句子,N-gram模型的优化目标则是最大化目标函数: - -$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ - -其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。 - -

-
- 图2. N-gram神经网络模型 -

- -图2展示了N-gram神经网络模型,从下往上看,该模型分为以下几个部分: - - 对于每个样本,模型输入$w_{t-n+1},...w_{t-1}$, 输出句子第t个词为字典中`|V|`个词的概率。 - - 每个输入词$w_{t-n+1},...w_{t-1}$首先通过映射矩阵映射到词向量$C(w_{t-n+1}),...C(w_{t-1})$。 - - - 然后所有词语的词向量连接成一个大向量,并经过一个非线性映射得到历史词语的隐层表示: - - $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ - - 其中,$x$为所有词语的词向量连接成的大向量,表示文本历史特征;$\theta$、$U$、$b_1$、$b_2$和$W$分别为词向量层到隐层连接的参数。$g$表示未经归一化的所有输出单词概率,$g_i$表示未经归一化的字典中第$i$个单词的输出概率。 - - - 根据softmax的定义,通过归一化$g_i$, 生成目标词$w_t$的概率为: - - $$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$ - - - 整个网络的损失值(cost)为多类分类交叉熵,用公式表示为 - - $$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ - - 其中$y_k^i$表示第$i$个样本第$k$类的真实标签(0或1),$softmax(g_k^i)$表示第i个样本第k类softmax输出的概率。 - - - -### Continuous Bag-of-Words model(CBOW) - -CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: - -

-
- 图3. CBOW模型 -

- -具体来说,不考虑上下文的词语输入顺序,CBOW是用上下文词语的词向量的均值来预测当前词。即: - -$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ - -其中$x_t$为第$t$个词的词向量,分类分数(score)向量 $z=U*context$,最终的分类$y$采用softmax,损失函数采用多类分类交叉熵。 - -### Skip-gram model - -CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 - -

-
- 图4. Skip-gram模型 -

- -如上图所示,Skip-gram模型的具体做法是,将一个词的词向量映射到$2n$个词的词向量($2n$表示当前输入词的前后各$n$个词),然后分别通过softmax得到这$2n$个词的分类损失值之和。 - - -## 数据准备 - -### 数据介绍 - -本教程使用Penn Treebank (PTB)(经Tomas Mikolov预处理过的版本)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下: - -

- - - - - - - - - - - - - - - - -
训练数据验证数据测试数据
ptb.train.txtptb.valid.txtptb.test.txt
42068句3370句3761句
-

- - -### 数据预处理 - -本章训练的是5-gram模型,表示在PaddlePaddle训练时,每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`,自动做数据的下载与预处理,方便大家使用。 - -预处理会把数据集中的每一句话前后加上开始符号``以及结束符号``。然后依据窗口大小(本教程中为5),从头到尾每次向右滑动窗口并生成一条数据。 - -如"I have a dream that one day" 一句提供了5条数据: - -```text - I have a dream -I have a dream that -have a dream that one -a dream that one day -dream that one day -``` - -最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 - - -## 编程实现 - -本配置的模型结构如下图所示: - -

-
- 图5. 模型配置中的N-gram神经网络模型 -

- -首先,加载所需要的包: - -```python -import paddle -import paddle.fluid as fluid -import numpy -from functools import partial -import math -import os -import sys -from __future__ import print_function -``` - -然后,定义参数: -```python -EMBED_SIZE = 32 # word vector dimension -HIDDEN_SIZE = 256 # hidden layer dimension -N = 5 # train 5-gram -BATCH_SIZE = 32 # batch size - -# can use CPU or GPU -use_cuda = os.getenv('WITH_GPU', '0') != '0' - -word_dict = paddle.dataset.imikolov.build_dict() -dict_size = len(word_dict) -``` - -不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。 - -- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。 - -```python -def inference_program(is_sparse): - first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') - second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') - third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') - fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64') - - embed_first = fluid.layers.embedding( - input=first_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=is_sparse, - param_attr='shared_w') - embed_second = fluid.layers.embedding( - input=second_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=is_sparse, - param_attr='shared_w') - embed_third = fluid.layers.embedding( - input=third_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=is_sparse, - param_attr='shared_w') - embed_fourth = fluid.layers.embedding( - input=fourth_word, - size=[dict_size, EMBED_SIZE], - dtype='float32', - is_sparse=is_sparse, - param_attr='shared_w') - - concat_embed = fluid.layers.concat( - input=[embed_first, embed_second, embed_third, embed_fourth], axis=1) - hidden1 = fluid.layers.fc(input=concat_embed, - size=HIDDEN_SIZE, - act='sigmoid') - predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') - return predict_word -``` - -- 基于以上的神经网络结构,我们可以如下定义我们的`训练`方法 - -```python -def train_program(is_sparse): - # The declaration of 'next_word' must be after the invoking of inference_program, - # or the data input order of train program would be [next_word, firstw, secondw, - # thirdw, fourthw], which is not correct. - predict_word = inference_program(is_sparse) - next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') - cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) - avg_cost = fluid.layers.mean(cost) - return avg_cost -``` - -- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集:`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中,读取器是一个Python的函数,每次调用,会读取下一条数据。它是一个Python的generator。 - -`paddle.batch` 会读入一个读取器,然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤,批次的训练情况。 - -```python -def optimizer_func(): - # Note here we need to choose more sophisticated optimizers - # such as AdaGrad with a decay rate. The normal SGD converges - # very slowly. - # optimizer=fluid.optimizer.SGD(learning_rate=0.001), - return fluid.optimizer.AdagradOptimizer( - learning_rate=3e-3, - regularization=fluid.regularizer.L2DecayRegularizer(8e-4)) - - -def train(use_cuda, train_program, params_dirname): - train_reader = paddle.batch( - paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) - test_reader = paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - # We output cost every 10 steps. - if event.step % 10 == 0: - outs = trainer.test( - reader=test_reader, - feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) - avg_cost = outs[0] - - print("Step %d: Average Cost %f" % (event.step, avg_cost)) - - # If average cost is lower than 5.8, we consider the model good enough to stop. - # Note 5.8 is a relatively high value. In order to get a better model, one should - # aim for avg_cost lower than 3.5. But the training could take longer time. - if avg_cost < 5.8: - trainer.save_params(params_dirname) - trainer.stop() - - if math.isnan(avg_cost): - sys.exit("got NaN loss, training failed.") - - trainer = fluid.Trainer( - train_func=train_program, - optimizer_func=optimizer_func, - place=place) - - trainer.train( - reader=train_reader, - num_epochs=1, - event_handler=event_handler, - feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) -``` - -- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下: - -```text -Step 0: Average Cost 7.337213 -Step 10: Average Cost 6.136128 -Step 20: Average Cost 5.766995 -... -``` - - -## 模型应用 -在模型训练后,我们可以用它做一些预测。 - -### 预测下一个词 -我们可以用我们训练过的模型,在得知之前的 N-gram 后,预测下一个词。 - -```python -def infer(use_cuda, inference_program, params_dirname=None): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) - - # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word - # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - - data1 = [[211]] # 'among' - data2 = [[6]] # 'a' - data3 = [[96]] # 'group' - data4 = [[4]] # 'of' - lod = [[1]] - - first_word = fluid.create_lod_tensor(data1, lod, place) - second_word = fluid.create_lod_tensor(data2, lod, place) - third_word = fluid.create_lod_tensor(data3, lod, place) - fourth_word = fluid.create_lod_tensor(data4, lod, place) - - result = inferencer.infer( - { - 'firstw': first_word, - 'secondw': second_word, - 'thirdw': third_word, - 'fourthw': fourth_word - }, - return_numpy=False) - - print(numpy.array(result[0])) - most_possible_word_index = numpy.argmax(result[0]) - print(most_possible_word_index) - print([ - key for key, value in word_dict.iteritems() - if value == most_possible_word_index - ][0]) -``` - -在经历3分钟的短暂训练后,我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长,比如几个小时,那么我们会得到的下一个预测是 `workers`。 - -```text -[[0.00106646 0.0007907 0.00072041 ... 0.00049024 0.00041355 0.00084464]] -6 -a -``` - -整个程序的入口很简单: - -```python -def main(use_cuda, is_sparse): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - - params_dirname = "word2vec.inference.model" - - train( - use_cuda=use_cuda, - train_program=partial(train_program, is_sparse), - params_dirname=params_dirname) - - infer( - use_cuda=use_cuda, - inference_program=partial(inference_program, is_sparse), - params_dirname=params_dirname) - - -main(use_cuda=use_cuda, is_sparse=True) -``` - - -## 总结 -本章中,我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中,我们可以根据向量间的余弦夹角,来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中,训练好的词向量可以用来初始化模型,以得到更好的效果。在文档分类中,有了词向量之后,可以用聚类的方法将文档中同义词进行分组,也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。 - - -## 参考文献 -1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155. -2. Mikolov T, Kombrink S, Deoras A, et al. [Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201. -3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013. -4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605. -5. https://en.wikipedia.org/wiki/Singular_value_decomposition - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/index.rst b/doc/fluid/new_docs/beginners_guide/index.rst deleted file mode 100644 index e18933dcc0038129077a455892ddd785579f0003..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -######## -新手入门 -######## - -.. todo:: - - 新手入门的导引文字,需要完善。 - -.. toctree:: - :maxdepth: 2 - - install/install_doc.rst - quick_start/index.rst - basics/index.rst - basics/learning_materials.md diff --git a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst deleted file mode 100644 index 18788d2eae048ac5120b0b7afd63cd784a235798..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst +++ /dev/null @@ -1,564 +0,0 @@ -.. _how_to_install: - -安装说明 -^^^^^^^^ - -若您的系统为Linux或Windows,您可以使用我们提供的安装包来安装PaddlePaddle。 - -对于MacOS系统,我们暂未提供安装包,您可以使用 **从源码编译** 的方式安装。 - - -.. _install_linux: - -在Linux安装PaddlePaddle --------- - -推荐您使用 `pip `_ -安装,它是Linux系统下最简单的安装方式。 - -注意事项: - -- PaddlePaddle Python API 依赖Python 2.7版本。 - -执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 - - .. code-block:: bash - - pip install paddlepaddle - -您可以通过指定版本号来安装其它版本,例如: - - .. code-block:: bash - - pip install paddlepaddle==0.13.0 - - -如果需要安装支持GPU的版本(cuda9.0_cudnn7_avx_openblas),需要执行: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下: - -================================= ======================================== -版本号 版本说明 -================================= ======================================== -paddlepaddle-gpu==0.14.0 使用CUDA 9.0和cuDNN 7编译的0.14.0版本 -paddlepaddle-gpu==0.14.0.post87 使用CUDA 8.0和cuDNN 7编译的0.14.0版本 -paddlepaddle-gpu==0.14.0.post85 使用CUDA 8.0和cuDNN 5编译的0.14.0版本 -paddlepaddle-gpu==0.13.0 使用CUDA 9.0和cuDNN 7编译的0.13.0版本 -paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 -paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 -paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 -paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 -================================= ======================================== - -您可以在 `Release History `_ -中找到paddlepaddle-gpu的各个发行版本。 - -如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl -安装包和c-api开发包并安装,您可以从下面的表格中找到需要的版本: - -如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: - -.. image:: paddleci.png - :scale: 50 % - :align: center - -.. csv-table:: 各个版本最新的whl包 - :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" - :widths: 1, 3, 3 - - "stable_cuda9.0_cudnn7", "`paddlepaddle_gpu-0.14.0-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0-cp27-cp27m-manylinux1_x86_64.whl `__" - "stable_cuda8.0_cudnn7", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27m-manylinux1_x86_64.whl `__" - "stable_cuda8.0_cudnn5", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27m-manylinux1_x86_64.whl `__" - "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" - "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" - "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - -.. _FAQ: - -安装常见问题和解决方法 -====================== - -- paddlepaddle*.whl is not a supported wheel on this platform. - -出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。 -请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准, -需要使用最新的pip (>9.0.0) 才可以安装。 - -可以使用下面的命令更新您的pip: - - .. code-block:: bash - - pip install --upgrade pip - -如果仍然存在问题,可以执行: - - .. code-block:: bash - - python -c "import pip; print(pip.pep425tags.get_supported())" - -获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包 -可以在 `这里 `_ 找到。 - -如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; -如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64, -可以重命名这个whl包为 manylinux1_x86_64 再安装。 - - -.. _install_windows: - -在Windows安装PaddlePaddle ------------------------------- -Windows系统需要通过Docker来使用PaddleaPaddle。Docker是一个虚拟容器,使用Docker可以简化复杂的环境配置工作。 - -我们提供了 `PaddlePaddle_Windows快速安装包 `_, -它能够帮助您安装Docker和PaddlePaddle。 - -* 安装包支持的系统:Windows7,Windows8的所有版本,Windows10的专业版、企业版。 - -* 如果您希望使用GPU提升训练速度,请使用Linux系统安装,Windows系统暂不支持。 - -.. _install_mac: - -在MacOS安装PaddlePaddle --------- - -对于MacOS系统,我们暂未提供pip安装方式,您可以使用 **源码编译** 的方式安装。 - -.. _others: - -其他安装方式 -------------- - -.. _source: -源码编译(使用Docker镜像) -========== - -.. _requirements: - -需要的软硬件 -""""""""""""" - -为了编译PaddlePaddle,我们需要 - -1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统 -2. Docker - -不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。 - -.. _build_step: - -编译方法 -""""""""""""" - -PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像可以在 `这里 `_ 找到。 - - -**I. 编译CPU-Only版本的PaddlePaddle,需要执行:** - -.. code-block:: bash - - # 1. 获取源码 - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - # 2. 执行如下命令下载最新版本的docker镜像 - docker run --name paddle-test -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash - # 3. 进入docker内执行如下命令编译CPU-Only的二进制安装包 - mkdir -p /paddle/build && cd /paddle/build - cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF - make -j$(nproc) - -**II. 编译GPU版本的PaddlePaddle,需要执行:** - -.. code-block:: bash - - # 1. 获取源码 - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - # 2. 安装nvidia-docker - apt-get install nvidia-docker - # 3. 执行如下命令下载支持GPU运行的docker容器 - nvidia-docker run --name paddle-test-gpu -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash - # 4. 进入docker内执行如下命令编译GPU版本的PaddlePaddle - mkdir -p /paddle/build && cd /paddle/build - cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF - make -j$(nproc) - -**注意事项:** - -* 上述有关 :code:`docker` 的命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 -* 进入 :code:`docker` 后执行 :code:`cmake` 命令,若是出现 :code:`patchelf not found, please install it.` 错误,则执行 :code:`apt-get install -y patchelf` 命令即可解决问题。 -* 若您在使用Docker编译PaddlePaddle遇到问题时, `这个issue `_ 可能会对您有所帮助。 - - -.. _source: -源码编译(不使用Docker镜像) -========== - -如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `附录:编译依赖`_ 之后才能开始编译的步骤。 - -.. _build_step: - -编译方法 -""""""""""""" - -在本机上编译CPU-Only版本的PaddlePaddle,需要执行如下命令: - -.. code-block:: bash - - # 1. 使用virtualenvwrapper创建python虚环境并将工作空间切换到虚环境 [可选] - mkvirtualenv paddle-venv - workon paddle-venv - # 2. 获取源码 - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - # 3. 执行下面的命令编译CPU-Only的二进制 - mkdir build && cd build - cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF - make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译 - - -**注意事项:** - -* MacOS系统下因为默认安装了cblas库,所以编译时可能会遇到 :code:`use of undeclared identifier 'openblas_set_num_threads'` 错误。因此,在执行cmake命令时需要指定所使用openblas库的头文件路径,具体操作如下: - - .. code-block:: bash - - cd Paddle/build && rm -rf * - cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DOPENBLAS_INC_DIR=/usr/local/Cellar/openblas/[本机所安装的openblas版本号]/include/ - make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译 -* 若您在MacOS系统下从源码编译PaddlePaddle遇到问题时, `这个issue `_ 可能会对您有所帮助。 - -编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: - -.. code-block:: bash - - pip install build/python/dist/*.whl - -如果机器中已经安装过PaddlePaddle,有两种方法: - -.. code-block:: bash - - 1. 先卸载之前的版本,再重新安装 - pip uninstall paddlepaddle - pip install build/python/dist/*.whl - - 2. 直接升级到更新的版本 - pip install build/python/dist/*.whl -U - -.. _run_test: - -执行单元测试 -""""""""""""" - -如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: - -设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 -开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build - -如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ): - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build - cd /paddle/build - ctest -R test_sum_op -V - -.. _faq_docker: - -常见问题 -""""""""""""" - -- 什么是 Docker? - - 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 - -- Docker 还是虚拟机? - - 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 - -- 为什么用 Docker? - - 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 - - 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 - -- 可以选择不用Docker吗? - - 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 - -- 学习 Docker 有多难? - - 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_。 - 这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - -- 可以用 IDE 吗? - - 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 - - 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 - - .. code-block:: bash - - (global-set-key "\C-cc" 'compile) - (setq compile-command - "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") - - 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 - -- 可以并行编译吗? - - 是的。我们的 Docker image 运行一个 `Bash 脚本 `_。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - -- Docker 需要 sudo - - 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 - -- 在 Windows/MacOS 上编译很慢 - - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_。 - -- 磁盘不够 - - 本文中的例子里, :code:`docker run` 命令里都用了 :code:`--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 :code:`docker ps -a` 命令看到停止后但是没有删除的 containers。 :code:`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 - - -.. _compile_deps: - -附录:编译依赖 -""""""""""""" - -PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 - -.. csv-table:: PaddlePaddle编译依赖 - :header: "依赖", "版本", "说明" - :widths: 10, 15, 30 - - "CMake", "3.4", "" - "GCC", "4.8.2", "推荐使用CentOS的devtools2" - "Python", "2.7.x", "依赖libpython2.7.so" - "SWIG", ">=2.0", "" - "wget","","" - "openblas","","" - "pip", ">=9.0", "" - "numpy", "", "" - "protobuf","3.1.0","" - "wheel","","" - "Go", ">=1.8", "可选" - - -.. _build_options: - -附录:编译选项 -""""""""""""" - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 -用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 -`官方文档 `_ 。 - -在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: 编译选项说明 - :header: "选项", "说明", "默认值" - :widths: 1, 7, 2 - - "WITH_GPU", "是否支持GPU", "ON" - "WITH_C_API", "是否仅编译CAPI", "OFF" - "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" - "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" - "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" - "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" - "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" - "WITH_TESTING", "是否开启单元测试", "OFF" - "WITH_DOC", "是否编译中英文文档", "OFF" - "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" - "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" - "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" - -BLAS -+++++ - -PaddlePaddle支持 `MKL `_ 和 -`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 - -如果关闭MKL,则会使用OpenBLAS作为BLAS库。 - -CUDA/cuDNN -+++++++++++ - -PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 -使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 - -PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 -我们推荐使用最新版本的cuDNN。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录( :code:`rm -rf` )后,再指定。 - -.. _install_docker: - -使用Docker安装运行 -================== - -使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境。 -您可以在 `Docker官网 `_ -获得基本的Docker安装和使用方法。 - -在了解Docker的基本使用方法之后,即可开始下面的步骤: - -.. _docker_pull: - -获取PaddlePaddle的Docker镜像 -"""""""""""""""""""""""""""" - -执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: - - .. code-block:: bash - - docker pull paddlepaddle/paddle - -对于国内用户,我们提供了加速访问的镜像源: - - .. code-block:: bash - - docker pull docker.paddlepaddlehub.com/paddle - -下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: - - .. code-block:: bash - - docker pull paddlepaddle/paddle:latest-gpu - docker pull docker.paddlepaddlehub.com/paddle:latest-gpu - -选择下载使用不同的BLAS库的Docker镜像: - - .. code-block:: bash - - # 默认是使用MKL的镜像 - docker pull paddlepaddle/paddle - # 使用OpenBLAS的镜像 - docker pull paddlepaddle/paddle:latest-openblas - -下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - - .. code-block:: bash - - docker pull paddlepaddle/paddle:[tag] - # 比如: - docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu - -.. _docker_run: - -在Docker中执行PaddlePaddle训练程序 -""""""""""""""""""""""""""""""""""" - -假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 -`PaddlePaddleBook `_ -编写),就可以使用下面的命令开始执行训练: - - .. code-block:: bash - - cd /home/work - docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - -上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` -指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` -目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` -为容器内执行的命令,即运行训练程序。 - -当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: - - .. code-block:: bash - docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash - cd /work - python train.py - -**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** - -.. _docker_run_book: - -使用Docker启动PaddlePaddle Book教程 -"""""""""""""""""""""""""""""""""""" - -使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 -PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 -如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 -大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 - -我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: - - .. code-block:: bash - - docker run -p 8888:8888 paddlepaddle/book - -国内用户可以使用下面的镜像源来加速访问: - - .. code-block: bash - - docker run -p 8888:8888 docker.paddlepaddlehub.com/book - -然后在浏览器中输入以下网址: - - .. code-block:: text - - http://localhost:8888/ - -就这么简单,享受您的旅程! - -.. _docker_run_gpu: - -使用Docker执行GPU训练 -"""""""""""""""""""""""""""" - -为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 -`nvidia-docker `_ 来运行镜像。 -请不要忘记提前在物理机上安装GPU最新驱动。 - - .. code-block:: bash - - nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - -**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - - .. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - -**关于AVX:** - -AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 -是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 -`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 - -以下指令能检查Linux电脑是否支持AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - -如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/fluid/new_docs/beginners_guide/install/paddleci.png b/doc/fluid/new_docs/beginners_guide/install/paddleci.png deleted file mode 100644 index 16087ce059aa3c07ce8c927d983eb86351915825..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/install/paddleci.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md deleted file mode 100644 index 9574dbea2f9a39bb196b61bb4fd12ba7c378f75a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md +++ /dev/null @@ -1,288 +0,0 @@ -# 线性回归 -让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。 - -本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)。 - -## 背景介绍 -给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即 - -$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldots,n$$ - -例如,在我们将要建模的房价预测问题里,$x_{ij}$是描述房子$i$的各种属性(比如房间的个数、周围学校和医院的个数、交通状况等),而 $y_i$是房屋的价格。 - -初看起来,这个假设实在过于简单了,变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点,它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。 - -## 效果展示 -我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。 -

-
- 图1. 预测值 V.S. 真实值 -

- -## 模型概览 - -### 模型定义 - -在波士顿房价数据集中,和房屋相关的值共有14个:前13个用来描述房屋相关的各种信息,即模型中的 $x_i$;最后一个值为我们要预测的该类房屋价格的中位数,即模型中的 $y_i$。因此,我们的模型就可以表示成: - -$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ - -$\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。 - -建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。 - -对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是: - -$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ - -即对于一个大小为$n$的测试集,$MSE$是$n$个数据预测结果误差平方的均值。 - -### 训练过程 - -定义好模型结构之后,我们要通过以下几个步骤进行模型训练 - 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。 - 2. 网络正向传播计算网络输出和损失函数。 - 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 - 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 - -## 数据集 - -### 数据集介绍 -这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: - -| 属性名 | 解释 | 类型 | -| ------| ------ | ------ | -| CRIM | 该镇的人均犯罪率 | 连续值 | -| ZN | 占地面积超过25,000平方呎的住宅用地比例 | 连续值 | -| INDUS | 非零售商业用地比例 | 连续值 | -| CHAS | 是否邻近 Charles River | 离散值,1=邻近;0=不邻近 | -| NOX | 一氧化氮浓度 | 连续值 | -| RM | 每栋房屋的平均客房数 | 连续值 | -| AGE | 1940年之前建成的自用单位比例 | 连续值 | -| DIS | 到波士顿5个就业中心的加权距离 | 连续值 | -| RAD | 到径向公路的可达性指数 | 连续值 | -| TAX | 全值财产税率 | 连续值 | -| PTRATIO | 学生与教师的比例 | 连续值 | -| B | 1000(BK - 0.63)^2,其中BK为黑人占比 | 连续值 | -| LSTAT | 低收入人群占比 | 连续值 | -| MEDV | 同类房屋价格的中位数 | 连续值 | - -### 数据预处理 -#### 连续值与离散值 -观察一下数据,我们的第一个发现是:所有的13维属性中,有12维的连续值和1维的离散值(CHAS)。离散值虽然也常使用类似0、1、2这样的数字表示,但是其含义与连续值是不同的,因为这里的差值没有实际意义。例如,我们用0、1、2来分别表示红色、绿色和蓝色的话,我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有$d$个可能取值的离散属性,我们会将它们转为$d$个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言,因为CHAS本身就是一个二值属性,就省去了这个麻烦。 - -#### 属性的归一化 -另外一个稍加观察即可发现的事实是,各维属性的取值范围差别很大(如图2所示)。例如,属性B的取值范围是[0.32, 396.90],而属性NOX的取值范围是[0.3850, 0.8170]。这里就要用到一个常见的操作-归一化(normalization)了。归一化的目标是把各位属性的取值范围放缩到差不多的区间,例如[-0.5,0.5]。这里我们使用一种很常见的操作方法:减掉均值,然后除以原取值范围。 - -做归一化(或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling))至少有以下3个理由: -- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。 -- 不同的数值范围会导致不同属性对模型的重要性不同(至少在训练的初始阶段如此),而这个隐含的假设常常是不合理的。这会对优化的过程造成困难,使训练时间大大的加长。 -- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。 - -

-
- 图2. 各维属性的取值范围 -

- -#### 整理训练集与测试集 -我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$ - - -在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 - -## 训练 - -`fit_a_line/trainer.py`演示了训练的整体过程。 - -### 配置数据提供器(Datafeeder) -首先我们引入必要的库: -```python -import paddle -import paddle.fluid as fluid -import numpy -from __future__ import print_function -``` - -我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) - -其中,在uci_housing模块中封装了: - -1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 -2. [数据预处理](#数据预处理)的过程。 - -接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性,她可以同时定义一个批次大小和一个缓存大小。这样的话,每次数据提供器会从缓存中随机读取批次大小那么多的数据。 - -```python -BATCH_SIZE = 20 - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - -test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.test(), buf_size=500), - batch_size=BATCH_SIZE) -``` - -### 配置训练程序 -训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结果,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。 - -```python -def train_program(): - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - # feature vector of length 13 - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - loss = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(loss) - - return avg_loss -``` - -### Optimizer Function 配置 - -在下面的 `SGD optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 - -```python -def optimizer_program(): - return fluid.optimizer.SGD(learning_rate=0.001) -``` - -### 定义运算场所 -我们可以定义运算是发生在CPU还是GPU - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 创建训练器 -训练器会读入一个训练程序和一些必要的其他参数: - -```python -trainer = fluid.Trainer( - train_func=train_program, - place=place, - optimizer_func=optimizer_program) -``` - -### 开始提供数据 -PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据,因此我们需要一个Python的list来定义读取顺序。 - -```python -feed_order=['x', 'y'] -``` - -除此之外,可以定义一个事件相应器来处理类似`打印训练进程`的事件: - -```python -# Specify the directory to save the parameters -params_dirname = "fit_a_line.inference.model" - -# Plot data -from paddle.v2.plot import Ploter -train_title = "Train cost" -test_title = "Test cost" -plot_cost = Ploter(train_title, test_title) - -step = 0 - -# event_handler prints training and testing info -def event_handler_plot(event): - global step - if isinstance(event, fluid.EndStepEvent): - if step % 10 == 0: # record a train cost every 10 batches - plot_cost.append(train_title, step, event.metrics[0]) - - if step % 100 == 0: # record a test cost every 100 batches - test_metrics = trainer.test( - reader=test_reader, feed_order=feed_order) - plot_cost.append(test_title, step, test_metrics[0]) - plot_cost.plot() - - if test_metrics[0] < 10.0: - # If the accuracy is good enough, we can stop the training. - print('loss is less than 10.0, stop') - trainer.stop() - step += 1 - - if isinstance(event, fluid.EndEpochEvent): - if event.epoch % 10 == 0: - # We can save the trained parameters for the inferences later - if params_dirname is not None: - trainer.save_params(params_dirname) -``` - -### 开始训练 -我们现在可以通过调用`trainer.train()`来开始训练 - -```python -%matplotlib inline - -# The training could take up to a few minutes. -trainer.train( - reader=train_reader, - num_epochs=100, - event_handler=event_handler_plot, - feed_order=feed_order) -``` -
-
-图3 训练结果 -
- - -## 预测 -提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。 - -### 设定预测程序 -类似于`trainer.train`,预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。 - - -```python -def inference_program(): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - return y_predict -``` - -### 预测 -预测器会从`params_dirname`中读取已经训练好的模型,来对从未遇见过的数据进行预测。 - -```python -inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=params_dirname, place=place) - -batch_size = 10 -test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size) -test_data = test_reader().next() -test_x = numpy.array([data[0] for data in test_data]).astype("float32") -test_y = numpy.array([data[1] for data in test_data]).astype("float32") - -results = inferencer.infer({'x': test_x}) - -print("infer results: (House Price)") -for idx, val in enumerate(results[0]): - print("%d: %.2f" % (idx, val)) - -print("\nground truth:") -for idx, val in enumerate(test_y): - print("%d: %.2f" % (idx, val)) -``` - -## 总结 -在这章里,我们借助波士顿房价这一数据集,介绍了线性回归模型的基本概念,以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来,因此弄清楚线性模型的原理和局限非常重要。 - - -## 参考文献 -1. https://en.wikipedia.org/wiki/Linear_regression -2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. -3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. -4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/index.rst b/doc/fluid/new_docs/beginners_guide/quick_start/index.rst deleted file mode 100644 index f5889ba52b8016596108de48bad59f238c16afc0..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/quick_start/index.rst +++ /dev/null @@ -1,13 +0,0 @@ -######## -快速入门 -######## - -.. todo:: - - 概述 - -.. toctree:: - :maxdepth: 2 - - fit_a_line/README.cn.md - recognize_digits/README.cn.md diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md deleted file mode 100644 index ac36c4ecf6b9b716fe5f0dbe2346e64918c22242..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md +++ /dev/null @@ -1,447 +0,0 @@ -# 识别数字 - -本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/167.html)。 - -## 背景介绍 -当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 - -

-
-图1. MNIST图片示例 -

- -MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 - -Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 - -有很多算法在MNIST上进行实验。1998年,LeCun分别用单层线性分类器、多层感知器(Multilayer Perceptron, MLP)和多层卷积神经网络LeNet进行实验,使得测试集上的误差不断下降(从12%下降到0.7%)\[[1](#参考文献)\]。此后,科学家们又基于K近邻(K-Nearest Neighbors)算法\[[2](#参考文献)\]、支持向量机(SVM)\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验,并采用多种预处理方法(如去除歪曲、去噪、模糊等)来提高识别的准确率。 - -本教程中,我们从简单的模型Softmax回归开始,带大家入门手写字符识别,并逐步进行模型优化。 - - -## 模型概览 - -基于MNIST数据训练一个分类器,在介绍本教程使用的三个基本图像分类网络前,我们先给出一些定义: -- $X$是输入:MNIST图片是$28\times28$ 的二维图像,为了进行计算,我们将其转化为$784$维向量,即$X=\left ( x_0, x_1, \dots, x_{783} \right )$。 -- $Y$是输出:分类器的输出是10类数字(0-9),即$Y=\left ( y_0, y_1, \dots, y_9 \right )$,每一维$y_i$代表图片分类为第$i$类数字的概率。 -- $L$是图片的真实标签:$L=\left ( l_0, l_1, \dots, l_9 \right )$也是10维,但只有一维为1,其他都为0。 - -### Softmax回归(Softmax Regression) - -最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征,然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。 - -输入层的数据$X$传到输出层,在激活操作之前,会乘以相应的权重 $W$ ,并加上偏置变量 $b$ ,具体如下: - -$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ - -其中 $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ - -对于有 $N$ 个类别的多分类问题,指定 $N$ 个输出节点,$N$ 维结果向量经过softmax将归一化为 $N$ 个[0,1]范围内的实数值,分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。 - -在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下: - -$$ L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$ - -图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 - -

-
-图2. softmax回归网络结构图
-

- -### 多层感知器(Multilayer Perceptron, MLP) - -Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。 - -1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 -2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。 -3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。 - - -图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 - -

-
-图3. 多层感知器网络结构图
-

- -### 卷积神经网络(Convolutional Neural Network, CNN) - -在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 - -

-
-图4. LeNet-5卷积神经网络结构
-

- -#### 卷积层 - -卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 - -

-
-图5. 卷积层图片
-

- -图5给出一个卷积计算过程的示例图,输入图像大小为$H=5,W=5,D=3$,即$5 \times 5$大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用$K$表示)组卷积核,即图中滤波器$W_0$和$W_1$。在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含($D=3)$个$3 \times 3$(用$F \times F$表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向($W$方向)和垂直方向($H$方向)的滑动步长为2(用$S$表示);对输入图像周围各填充1(用$P$表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为$3 \times 3 \times 2$(用$H_{o} \times W_{o} \times K$表示)大小的特征图,即$3 \times 3$大小的2通道特征图,其中$H_o$计算公式为:$H_o = (H - F + 2 \times P)/S + 1$,$W_o$同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置$b_o$,偏置通常对于每个输出特征图是共享的。输出特征图$o[:,:,0]$中的最后一个$-2$计算如图5右下角公式所示。 - -在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为$D \times F \times F \times K$。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 - -- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 - -- 权重共享:计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算$o[:,:,0]$的每个每个神经元的滤波器均相同,都为$W_0$,这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的,例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的,比如输入的图片是人脸,眼睛和头发位于不同的位置,希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的,在卷积层,通常采用多组卷积核提取不同特征,即对应不同深度切片的特征,不同深度切片的神经元权重是不共享。另外,偏重对同一深度切片的所有神经元都是共享的。 - -通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 - -#### 池化层 - -

-
-图6. 池化层图片
-

- -池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 - -更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。 - -### 常见激活函数介绍 -- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ - -- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $ - - 实际上,tanh函数只是规模变化的sigmoid函数,将sigmoid函数值放大2倍之后再向下平移1个单位:tanh(x) = 2sigmoid(2x) - 1 。 - -- ReLU激活函数: $ f(x) = max(0, x) $ - -更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。 - -## 数据介绍 - -PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下: - - -| 文件名称 | 说明 | -|----------------------|-------------------------| -|train-images-idx3-ubyte| 训练数据图片,60,000条数据 | -|train-labels-idx1-ubyte| 训练数据标签,60,000条数据 | -|t10k-images-idx3-ubyte | 测试数据图片,10,000条数据 | -|t10k-labels-idx1-ubyte | 测试数据标签,10,000条数据 | - -## Fluid API 概述 - -演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。 -我们建议使用 Fluid API,因为它更容易学起来。 - -下面是快速的 Fluid API 概述。 -1. `inference_program`:指定如何从数据输入中获得预测的函数。 -这是指定网络流的地方。 - -1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。 -这是指定损失计算的地方。 - -1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。 - -1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。 -通过 `event_handler` 回调函数,用户可以监控培训的进展。 - -1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。 -然后,它可以推断数据和返回预测。 - -在这个演示中,我们将深入了解它们。 - -## 配置说明 -加载 PaddlePaddle 的 Fluid API 包。 - -```python -import paddle -import paddle.fluid as fluid -from __future__ import print_function -``` - -### Program Functions 配置 - -我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器,每个分类器都定义为 Python 函数。 -我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。 -让我们创建一个数据层来读取图像并将其连接到分类网络。 - -- Softmax回归:只通过一层简单的以softmax为激活函数的全连接层,就可以得到分类的结果。 - -```python -def softmax_regression(): - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - predict = fluid.layers.fc( - input=img, size=10, act='softmax') - return predict -``` - -- 多层感知器:下面代码实现了一个含有两个隐藏层(即全连接层)的多层感知器。其中两个隐藏层的激活函数均采用ReLU,输出层的激活函数用Softmax。 - -```python -def multilayer_perceptron(): - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - # 第一个全连接层,激活函数为ReLU - hidden = fluid.layers.fc(input=img, size=200, act='relu') - # 第二个全连接层,激活函数为ReLU - hidden = fluid.layers.fc(input=hidden, size=200, act='relu') - # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - return prediction -``` - -- 卷积神经网络LeNet-5: 输入的二维图像,首先经过两次卷积层到池化层,再经过全连接层,最后使用以softmax为激活函数的全连接层作为输出层。 - -```python -def convolutional_neural_network(): - img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') - # 第一个卷积-池化层 - conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") - conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) - # 第二个卷积-池化层 - conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 - prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') - return prediction -``` - -#### Train Program 配置 -然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。 -在训练期间,它将从预测中计算 `avg_cost`。 - -**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 - -请随意修改代码,测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。 - -```python -def train_program(): - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - # predict = softmax_regression() # uncomment for Softmax回归 - # predict = multilayer_perceptron() # uncomment for 多层感知器 - predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络 - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - acc = fluid.layers.accuracy(input=predict, label=label) - return [avg_cost, acc] - - -``` - -#### Optimizer Function 配置 - -在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 - -```python -def optimizer_program(): - return fluid.optimizer.Adam(learning_rate=0.001) -``` - -### 数据集 Feeders 配置 - -下一步,我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。 - -下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 - -`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 - -```python -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=64) - -test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=64) -``` - -### Trainer 配置 - -现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。 - -```python -# 该模型运行在单个CPU上 -use_cuda = False # set to True if training with GPU -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - -trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_program) -``` - -#### Event Handler 配置 - -Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过机制监控培训进度。 -我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 ,看看有什么不同。 - -`event_handler` 用来在训练过程中输出训练结果 - -```python -# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer -params_dirname = "recognize_digits_network.inference.model" -lists = [] -def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - if event.step % 100 == 0: - # event.metrics maps with train program return arguments. - # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. - print("Pass %d, Batch %d, Cost %f" % ( - event.step, event.epoch, event.metrics[0])) - - if isinstance(event, fluid.EndEpochEvent): - avg_cost, acc = trainer.test( - reader=test_reader, feed_order=['img', 'label']) - - print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc)) - - # save parameters - trainer.save_params(params_dirname) - lists.append((event.epoch, avg_cost, acc)) -``` - -`event_handler_plot` 可以用来在训练过程中画图如下: - -
-
-图7 训练结果 -
- - -```python -from paddle.v2.plot import Ploter - -train_title = "Train cost" -test_title = "Test cost" -cost_ploter = Ploter(train_title, test_title) -step = 0 -lists = [] - -# event_handler to plot a figure -def event_handler_plot(event): - global step - if isinstance(event, fluid.EndStepEvent): - if step % 100 == 0: - # event.metrics maps with train program return arguments. - # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. - cost_ploter.append(train_title, step, event.metrics[0]) - cost_ploter.plot() - step += 1 - if isinstance(event, fluid.EndEpochEvent): - # save parameters - trainer.save_params(params_dirname) - - avg_cost, acc = trainer.test( - reader=test_reader, feed_order=['img', 'label']) - cost_ploter.append(test_title, step, avg_cost) - lists.append((event.epoch, avg_cost, acc)) -``` - -#### 开始训练 - -既然我们设置了 `event_handler` 和 `data reader`,我们就可以开始训练模型了。 - -`feed_order` 用于将数据目录映射到 `train_program` - -```python -trainer.train( - num_epochs=5, - event_handler=event_handler, - reader=train_reader, - feed_order=['img', 'label']) -``` - -训练过程是完全自动的,event_handler里打印的日志类似如下所示: - -``` -Pass 0, Batch 0, Cost 0.125650 -Pass 100, Batch 0, Cost 0.161387 -Pass 200, Batch 0, Cost 0.040036 -Pass 300, Batch 0, Cost 0.023391 -Pass 400, Batch 0, Cost 0.005856 -Pass 500, Batch 0, Cost 0.003315 -Pass 600, Batch 0, Cost 0.009977 -Pass 700, Batch 0, Cost 0.020959 -Pass 800, Batch 0, Cost 0.105560 -Pass 900, Batch 0, Cost 0.239809 -Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338 -``` - -训练之后,检查模型的预测准确度。用 MNIST 训练的时候,一般 softmax回归模型的分类准确率为约为 92.34%,多层感知器为97.66%,卷积神经网络可以达到 99.20%。 - - -## 应用模型 - -可以使用训练好的模型对手写体数字图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。 - -### Inference 配置 - -`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 -我们可以简单地插入在此之前定义的分类器。 - -```python -inferencer = fluid.Inferencer( - # infer_func=softmax_regression, # uncomment for softmax regression - # infer_func=multilayer_perceptron, # uncomment for MLP - infer_func=convolutional_neural_network, # uncomment for LeNet5 - param_path=params_dirname, - place=place) -``` - -### 生成预测输入数据 - -`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。 - -```python -# Prepare the test image -import os -import numpy as np -from PIL import Image -def load_image(file): - im = Image.open(file).convert('L') - im = im.resize((28, 28), Image.ANTIALIAS) - im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) - im = im / 255.0 * 2.0 - 1.0 - return im - -cur_dir = cur_dir = os.getcwd() -img = load_image(cur_dir + '/image/infer_3.png') -``` - -### 预测 - -现在我们准备做预测。 - -```python -results = inferencer.infer({'img': img}) -lab = np.argsort(results) # probs and lab are the results of one batch data -print ("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) -``` - -## 总结 - -本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型,后续章节中复杂的神经网络都是从它们衍生出来的,因此这几个模型对之后的学习大有裨益。同时,我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候,MNIST数据集上的识别准确率有了大幅度的提升,原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候,希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外,本教程还介绍了PaddlePaddle模型搭建的基本流程,从dataprovider的编写、网络层的构建,到最后的训练和预测。对这个流程熟悉以后,大家就可以用自己的数据,定义自己的网络模型,并完成自己的训练和预测任务了。 - -## 参考文献 - -1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324. -2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014). -3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190. -4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003. -5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007. -6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. -7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. -8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. -9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. -10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/faq/faq.rst b/doc/fluid/new_docs/faq/faq.rst deleted file mode 100644 index 3b4bd4f895162fa3b0ba12e785e38ad694590b25..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/faq/faq.rst +++ /dev/null @@ -1,12 +0,0 @@ -################### -编译安装与单元测试 -################### - -1. 通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` ------------------------------------------------------------------------------------------- -出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, -但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` -拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, -即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 - -**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 diff --git a/doc/fluid/new_docs/faq/index_cn.rst b/doc/fluid/new_docs/faq/index_cn.rst deleted file mode 100644 index bb2ed99217609d3a9edd179d4f98ad5b8b649860..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/faq/index_cn.rst +++ /dev/null @@ -1,9 +0,0 @@ -FAQ -==== - -本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。 - -.. toctree:: - :maxdepth: 1 - - faq.rst diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst deleted file mode 100644 index 55c3c761f932713ffa2b462b35f9f46a8edae536..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_basic_concept.rst +++ /dev/null @@ -1,392 +0,0 @@ -================================ -PaddleFluid设计思想和基本使用概念 -================================ - - - -Paddle Fluid 是用来让用户像 PyTorch 和 Tensorflow Eager Execution 一样执行程序。 -在这些系统中,不再有模型这个概念,应用也不再包含一个用于描述 Operator 图或者一系列层的符号描述, -而是像通用程序那样描述训练或者预测的过程。 - - -深度学习平台的演化 -================ - -时至今日,深度学习已成为事实上最流行的机器学习技术。学术界多年研究加上工业界的长期实践提出了若干有效的基本建模单元: -全连接,卷积,循环神经网络等;设计各类训练技巧:初始化方法,跨层连接,各类 norm 技术等; -发明了各种新的优化算法:Adadelta,Adam 等; -各类固定的网络结构:highway, residual, attention 等纷纷涌现,不胜枚举。 -学术界工业界多年的付出共同促成了深度学习方法今日的影响力。 - -学术研究和生产实践中积累了大量的知识,能够很好的解释神经网络中基本模块各自独的学习能力和特性。 -基本模块和训练技术的组合能够搭建出千变万化的神经网络模型。 -基本模块和训练技术是有限的,但他们的组合却是千变万化,这是深度学习方法的魅力所在,也是难度所在。 - -正是这样高度的模块化特性,研究者和工程师们都在努力避免重复造轮子以提高研究和生产的效率, -又进一步催生了深度学习平台技术的发展,深度学习框架已演变成为 AI 基础设施中重要的一部分。 -从 Theano,到 DistBelief,到 TensorFlow;从 Caffe 到 Caffe2; -从 Torch 到 PyTorch;从 PaddlePaddle 到 PaddleFluid, -深度学习平台技术也经历了两代的演化,并向着第三代平台技术迈进。 - -站在历史发展的今天,当我们准备切换尝试使用一个新的深度学习平台作为支持自己学习和研究的工具时, -平台技术都发生了哪些演化,能够为我们的带来什么便利呢? - -先让我们来看看深度学习框架解决的三大问题: - -- 如何描述计算以支持未来潜在会出现的新模型? -- 如何高效利用异构设备最大化算力? -- 如何利用网络中的计算机进行分布式计算来处理千万亿级别的数据? - -以上三个问题中的第一个和使用者研究者最为密切相关。 -这篇文章我们通过分析 PaddleFluid的设计理念, -来了解一个深度学习框架如何抽象深度学习模型,来看看我们的使用经验如何在不同深度学习平台之间过度和迁移。 - -如何描述计算 -============= - -让我们首先来看看 PaddleFluid 如何描述机器学习模型 - - -PaddleFluid之 :code:`Program` - -如何描述计算很大程度决定了一个神经网络框架计算功能的完备性。 -深度学习模型和方法历经二十多年的发展:“依次执行一组计算的前向, -再以和前向计算相反的顺序执行反向计算,中间无分支无交互”, -这样的模型结构已经无法满足研究者和千千万万框架使用者的想象力。 - -从 `PaddleFluid 的设计目标 `_ 来看, -在如何描述机器学习模型这一核心问题上,PaddleFluid 的目标是: -创造一种新的计算描述方式,不但能够描述至今为止人们已知的主流神经网络模型,并且能够支持未来会出现的任意模型。 - -PaddleFluid 是如何做到支持未来出现的新模型这一目标呢?PaddleFluid 的设计选择是: -对用户来说,用一段 :code:`Program` (在 PaddleFluid 内部会被转化为一种叫作 :code:`ProgramDesc` 的描述语言), -而不是用计算图来描述机器学习模型。 :code:`Program` 用符合用户使用直觉的方式, -提供一种新的描述语言能够描述任意复杂的机器学习模型。 - -对所有计算机专业同学学习编程语言的第一课一定是建立对“程序语言的三种执行结构:顺序执行,条件选择和循环执行”的认识。 -计算机世界的所有可计算逻辑都是由这三种执行结构表示,用这三种结构描述的逻辑是可计算的。那么同样道理, -对一个神经网络框架来说,如果可以和程序语言一样提供对这三种执行结构的支持,那么将可以描述任意复杂的, -可被计算机计算的机器学习模型。PaddleFluid通过提供对这三种执行结构的支持,来做到对任意复杂模型的描述。 - -具体来说: - -1. Fluid 的核心设计理念都可以类比到程序语言,如果已经有写程序的经验,那么使用 Fluid 构建神经网络模型的体验,将非常接近写程序; - -2. 在 PaddleFluid 中,用户不会显示地感知“计算图”这样的概念,一个机器学习模型被描述为一个 Fluid :code:`Program` (Fluid 内部称之为 :code:`ProgramDesc` ); - -- 一个 Fluid :code:`Program` 由一组嵌套的 :code:`Block` 构成。 :code:`Block` 的概念可以类比到 C++ 或是 Java 中的一对大括号,或是 Python 语言中的一个缩进快; -- :code:`Block` 中的计算由顺序执行、条件选择或者循环执行三种方式组合,构成复杂的计算逻辑。 - -3. Fluid :code:`Program` 中包含对计算和计算对象的描述。计算的描述称之为 Operator;计算作用的对象(或者说 Operator 的输入和输出)被统一为 Tensor。 - -在描述计算和计算的作用对象这一问题上,各个深度学习框架的选择是相同的,如果有一个平台的使用经验,那么将非常容易在各个平台之间进行迁移。 - -核心使用概念 -============= - -下面,我们将更详细地了解核心使用概念在PaddlePaddle的使用方法。 - -数据表示和计算的对象:Tensor --------------------------- - -Tensor 是向量矩阵概念的扩展,是神经网络模型计算操作的基本对象。这在是今天所有主流深度学习平台的共同选择。 - -可以简单地将 Tensor 理解为一个 N 维向量,它可以有任意多的维度。一个 Tensor 具有两个基本特征: - -1. 数据类型:每个 Tensor 的所有元素具有同样的、已知的数据类型; - -2. 大小(或者说形状):即维度的个数(rank,阶)以及各维度的长度。 - -Tensor 某些维度的长度在定义模型阶段可能是未知的,在实际算法执行时才能确定。例如一个 mini-batch 中包含的样本数目(batch size),或者是一个 mini-batch 中序列的最大长度。 - -PaddleFluid中的Tensor -"""""""""""""""""""""" - -PaddleFluid 中也使用 Tensor 作为神经网络中输入输出数据的统一表示。Tensor 的概念在今天主流的深度学习平台中都是完全相同,可以在各个深度学习框架之间直接无缝迁移。 - -在 Fluid 中也同样存在三种特殊的 Tensor: - -1. 模型中的可学习参数 - -模型中的可学习参数生存期和整个训练任务一样长,会接受优化算法的更新。在 PaddleFluid 中同样以 :code:`Variable` 表示; -用户在绝大多数情况下都不需要自己来创建网络中的可学习参数,Fluid 为几乎常见的神经网络基本计算模块都提供了封装。 -以最简单的全连接模型为例,下面的代码片段会直接为全连接层创建连接权值 WW 和偏置( :code:`bias` )两个可学习参数, -无需显示地调用 variable 相关接口创建可学习参数。 - - -:: - - import paddle.fluid as fluid - - y = fluid.layers.fc(input=x, size=128, bias_attr=True) - -2. 输入输出Tensor - -整个神经网络的输入数据也是一个特殊的 Tensor,在这个 Tensor 中, -一些维度的大小在定义模型时无法确定(通常包括:batch size; -如果 mini-batch 之间,数据可变,也会包括序列的最大长度,图片的宽度和高度等),在定义模型时需要占位; -PaddleFluid 中使用 :code:`fluid.layers.data` 来接入输入数据, :code:`fluid.layer.data` 需要提供输入 Tensor 的 形状信息, -当遇到无法确定的维度 时, 相应维度指定为 None ,如下面的代码片段所示: - -:: - - import paddle.fluid as fluid - - x = fluid.layers.data(name="x", shape=[2, None, 3], dtype="int64") - -3. 常量 Tensor 在 PaddleFluid 中需要通过组合 Tensor 和 :code:`fluid.layers.assign` 来实现。 - - -计算原语:Operation/Operator ----------------------------- - -Tensor 是今天所有主流深度学习框架的统一数据表示(输入、输出、中间计算结果、模型的可学习参数都是 Tensor)。 -另一方面,对数据的操作,在主流深度学习框架中也高度统一为:Operator/Operation。 -在中文中,通常我们会习惯将其称之为算子。 - -注:在 PaddleFluid 中使用 Operator 称呼对 Tensor 的操作。 - -Operation/Operator 接受多个 Tensor 作为输入,输出若干个 Tensor,表示了从输入到输出的变化。 - -PaddleFluid中的Operator -"""""""""""""""""""""""" - -PaddleFluid 支持的所有算子,可以在 `API 帮助文档 `_ 中查看。 - -为了便于用户使用,在 Python 端,Fluid 中的 Operator 被进一步封装入 :code:`paddle.fluid.layers` , -:code:`paddle.fluid.networks` 等模块。这是因为:一些常见的对Tensor的操作可能是有更多基础操作构成, -例如:l2 norm 内部由 reduce、elementwise_add,scale 等多个 Operator 组合计算逻辑完成, -为了提高使用的便利性,框架内部对基础 Operator 进行了一些封装,包括创建 Operator 依赖可学习参数, -可学习参数的初始化细节等,减少用户重复开发的成本。 - -对所有深度学习框架都面临同样的封装,在绝大多数情况下,用户很少会直接与框架底层的 Operator 直接打交道,而是使用框架提供的 layers,networks 等模块,降低开发的代码量。不论是什么样的概念,他们在各框架之间的本质和作用都是相同的:对 Tensor 的变换。 - -总结 ->>>>>> - -不论叫作 Operation、Operator 还是 layers,他们在各深度学习平台中的含义和作用都是相同的:对 Tensor 的变换。是一个深度学习平台提供的基础计算能力。可以在每个平台各自的 API 帮助文档中查到。 - -在各个深度学习平台都已加入 ONNX 项目的今天,每个深度学习平台提供给大家的基本算子都已趋同,与此同时,每个平台也各有其特点,会提供一些独特的算子,方便某一类任务的开发。 - -构建模型并执行 --------------- - -整个训练任务运行方法如下: - -Fluid中的Program和Executor -""""""""""""""""""""""""""" - -1. Fluid 使用 :code:`Program` 描述神经网络模型,对用户来说,并没有计算图的概念。 -用户定义的所有 Tensor 以及对 Tensor 的操作:Operator 都会被加入一段 :code:`Program` 中; - -一段 Program 由嵌套的 :code:`Block` 构成,但用户无需显示地创建 :code:`Block` 或是显示地注意到 :code:`Block` 的存在; -在 Fluid 程序中, :code:`Block` 是在调用 :code:`while_op` , :code:`if_op` , :code:`parallel_do` 等特殊 :code:`Operator` 时,由这些 :code:`Operator` 来创建; -对用户使用来说,只需要知道自己正在向一段 Fluid Program 中添加变量( :code:`Tensor` )和操作( :code:`Operator` )即可。 - -2. Fluid 利用 :code:`Executor` 来执行一段 Fluid :code:`Program` 。 - -为进一步理解 Fluid 中 :code:`Executor` 的作用,需要先解释一下 Fluid 程序的执行流程。 下图展示单机上,Fluid 程序的执行流程: - -.. figure:: fluid_local_train.jpeg - - :scale: 50% - :align: center - - Figure.1 - - Fluid本地训练任务执行流程图 - -1. Fluid 设计思想和灵感非常类似于程序设计语言,和高级编译语言 C++/Java 编写程序的过程非常类似,Fluid 程序执行分为两个重要阶段:编译时和运行时; - -2. 编译期,用户通过调用 Fluid 提供的算子,向一段 :code:`Program` 中添加变量(Tensor)以及对变量的操作(Operators 或者 Layers)。用户只需要描述核心的前向计算,不需要关心反向计算,分布式下,异构设备下如何计算; - -3. 原始的 :code:`Program` 在平台内部转换为中间描述语言: :code:`ProgramDesc` ; - -4. 编译期最重要的一个功能模块是 Transpiler。Transpiler 接受一段 :code:`ProgramDesc` ,输出一段变化后的 :code:`ProgramDesc` ,作为后端 Executor 最终需要执行的 :code:`Fluid Program` ; - -最为常用的 Transipler 包括: - -1. 内存优化 Transipler:通过对变量读写依赖关系分析,插入内存回收 Operator 以维持运行过程中较小的内存开销; - -2. 分布式环境下的 Transpiler:接受用户定义的 local Program ,生成 Parameter Client 和 Parameter Server 执行的两段 :code:`Program` 。 - -3. 后端 Executor 接受 Transpiler 输出的这段 :code:`Program` ,依次执行其中的 Operator(可以类比为程序语言中的指令),在执行过程中会为 Operator 创建所需的输入输出并进行管理。 - -从上面的过程中可以看到,Fluid 程序的执行过程分为:编译器的定义 :code:`Program` ,和创建 :code:`Executor` 运行 :code:`Program` 。 - :code:`Executor` 执行一段 :code:`Program` 的过程是不可交互和不可中断的。 - -在 Fluid 中,可以创建多余一段 :code:`Program` 。默认情况,一个 PaddleFluid 程序中存在 2 段 Program: - -1. :code:`fluid.framework.default_startup_program` :其中定义了创建模型参数,输入输出,以及模型中可学习参数的初始化等各种操作; - -- :code:`default_startup_program` 可以由框架自动生成,使用时无需显示地创建; -- 如果调用修改了参数的默认初始化方式,框架会自动的将相关的修改加入 :code:`default_startup_program` 。 - -2. :code:`fluid.framework.default_main_program` :定义了神经网络模型,前向反向计算,以及优化算法对网络中可学习参数的更新; - -- 使用 Fluid 的核心就是构建起 :code:`default_main_program` 。 - -3. PaddleFluid 中的 :code:`Scope` 类似于 TensorFlow 中的 collection 这一概念,但在 Fluid 中 :code:`Scope` 是框架后端概念,用户无法直接操作。因此,在使用框架时无需关心。 - -总结 -""""" - -Fluid 中通过 Executor 来执行一段用户定义的 Fluid :code:`Program` 。 -1. Executor 连接了 Fluid 的前端和后端; - -2. Executor 接受用户定义的原始模型(一段 :code:`Program` ),通过调用系统中不同功能更的 :code:`Transpiler` 完成对原始 :code:`Program` 的变化,进行优化。 - -完整实例:如何完成一个机器学习模型的训练 -=================================== - - - -这一节,我们以 MNIST 手写数字识别问题 —— 机器学习任务的“Hello World”问题和数据,为例,通过一个可以运行的完整实例,来学习上文介绍的概念如何在PaddleFluid 平台使用。 - -步骤1:定义数据 ----------------- - -PaddleFluid 中以 :code:`fluid.layers.data` 来接收输入数据。 - -:: - - import numpy as np - - import paddle.fluid as fluid - import paddle.v2 as paddle - - # define the input layers for the network. - x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") - y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") - -Fluid 中 Tensor 的第 0 维度固定为 batch size。在上面代码段中,图像输入 :code:`x` 的形状为:[1, 28, 28]。这三个维度的含义分别是:channel 数目,图像的高度和宽度。 - -实际上 Fluid 框架内部,一幅图像输入是一个 4-D Tensor,所有 Tensor 的第 0 维固定为 batch size。框架内部会自动为batch size进行填充占位。无需对batch size指定填充占位。 - -如果除去 batch size(第 0 维度)外,如果 Tensor 某一维度的大小只能在运行时确定,可以在该位置上直接指定 :code:`None` 进行占位。 - -步骤2:定义模型 --------------- - -通过调用 Fluid 提供的算子定义含有一个隐层的神经网络。Fluid 模型的分为模型结构和优化方法两部分。这一点与 TensorFlow 程序十分相似似,使用概念可以直接对应进行迁移。 - -:: - - # define the network topology. - y = fluid.layers.fc(input=x, size=10, act="softmax") - loss = fluid.layers.cross_entropy(input=y, label=y_) - avg_loss = fluid.layers.mean(loss) - - # define the optimization algorithm. - optimizer = fluid.optimizer.Adam(learning_rate=1e-3) - optimizer.minimize(avg_loss) - -Fluid 使用 Program 而不是计算图描述模型,一般情况下,用户无需关心 Program 的细节,当调用以上 layers 时,会向一个全局的 Program: :code:`fluid.framework.default_main_program` 中插入变量(Tensor)和对变量的操作(上述代码段中的 layers 和 optimzier)。 - -步骤3:参数初始化 ----------------- - -如上文介绍,Fluid 程序中的 Executor 是连接 Fluid 前端和后端的接口。 - -默认一个Fluid模型存在至少两段 Program。用于初始化网络中的可学习参数的那一段 :code:`Program` 叫作 :code:`fluid.default_startup_program()` 。 - -只有执行器 executor 可以执行 Fluid Program,因此,在初始化网络中的可学习参数之前,需要首先创建一个 Fluid executor。 - -:: - - # define the executor. - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - -在以上代码段中, :code:`place` 用于告诉 executor 一段 Fluid Program 在何种设备上执行, -常见的有 :code:`fluid.CPUPlace()` 和 :code:`fluid.CUDAPlace()` 。 - -步骤4:数据输入 + 执行模型训练 ----------------------------- - -我们在步骤 2 中定义的神经网络模型最终被插入一段叫做 :code:`fluid.framework.default_main_program` 的 Fluid Program 中。 - -网络可学习参数初始化之后,可以通过让执行器 Executor 执行这段 :code:`fluid.framework.default_main_program` 来进行训练。 - -:: - - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), - batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) - - for pass_id in range(100): - for batch_id, data in enumerate(train_reader()): - loss = exe.run( - fluid.framework.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_loss]) - print("Cur Cost : %f" % (np.array(loss[0])[0])) - -从上面的代码片段中可以看到,Fluid 程序的训练过程和 TensorFlow 程序的训练过程非常接近, -都放在一个 :code:`for` 循环中,循环读取一个 mini-batch 数据, -调用执行器执行 Fluid :code:`default_main_program` :接收 mini-batch 输入,在其上进行前向,反向和参数更新计算。 - -`注:上面程序使用了 Fluid 内置的 MNIST 数据,和我们提供给 TensorFlow 示例程序的 MNIST 数据完全一样。` - -步骤5:观察模型效果 ------------------ - -以上步骤已经构成了完整的 Tensorflow 模型训练程序,每个 batch 观察一次 loss,可以直观看到模型的迭代效果: - -.. figure:: fluid_mnist.png - - :scale: 40% - :align: center - - Figure.2 - - Fluid MNIST手写数字识别任务代价下降曲线 - -附:完整代码 ------------- - -:: - - import numpy as np - - import paddle.fluid as fluid - import paddle.v2 as paddle - - - def main(): - BATCH_SIZE = 128 - - # define the input layers for the network. - x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") - y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") - - # define the network topology. - y = fluid.layers.fc(input=x, size=10, act="softmax") - loss = fluid.layers.cross_entropy(input=y, label=y_) - avg_loss = fluid.layers.mean(loss) - - optimizer = fluid.optimizer.Adam(learning_rate=5e-3) - optimizer.minimize(avg_loss) - - # define the executor. - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), - batch_size=BATCH_SIZE) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) - - for pass_id in range(100): - for batch_id, data in enumerate(train_reader()): - loss = exe.run( - fluid.framework.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_loss]) - print("Cur Cost : %f" % (np.array(loss[0])[0])) - - if __name__ == "__main__": - main() diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg deleted file mode 100644 index 0a495901fafb85987e34acc3c454fb87e8160fca..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_local_train.jpeg and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png b/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png deleted file mode 100644 index e5ad0ba058c863cf68ef0789e58fcf67b3115fdb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/basic_concept/fluid_mnist.png and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst b/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst deleted file mode 100644 index 5946a2ccb7e43004eae39ec4b3c6112c66c1fd04..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/configure_simple_model/index.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _user_guide_configure_simple_model: - -############## -配置简单的网络 -############## - -在解决实际问题时,可以先从逻辑层面对问题进行建模,明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line `_。 - -问题描述及定义 -############## - -问题描述: 给定一组数据 :math:``,求解出函数 :math:`f`,使得 :math:`y=f(x)`,其中 :math:`x\subset X` 表示一条样本的特征,为 :math:`13` 维的实数向量;:math:`y \subset Y` 为一实数表示该样本对应的值。 - -我们可以尝试用回归模型来对问题建模,回归问题的损失函数有很多,这里选择常用的均方误差。为简化问题,这里假定 :math:`f` 为简单的线性变换函数,同时选用随机梯度下降算法来求解模型。 - -+----------------+----------------------------------------------+ -| 输入数据类型 | 样本特征: 13 维 实数 | -+ +----------------------------------------------+ -| | 样本标签: 1 维 实数 | -+----------------+----------------------------------------------+ -| 计算逻辑 | 使用线性模型,产生 1维实数作为模型的预测输出 | -+----------------+----------------------------------------------+ -| 求解目标 | 最小化模型预测输出与样本标签间的均方误差 | -+----------------+----------------------------------------------+ -| 优化算法 | 随机梯度下降 | -+----------------+----------------------------------------------+ - -使用PaddlePadle建模 -################### - -从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后,需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分,分别是:输入数据格式定义,模型前向计算逻辑,损失函数以及优化算法。 - -数据层 ------- - -PaddlePaddle提供了 :code:`fluid.layers.data()` 算子来描述输入数据的格式。 - -:code:`fluid.layers.data()` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力,可以表示多维数据。为了精确描述数据结构,通常需要指定数据shape以及数值类型type。其中shape为一个整数向量,type可以是一个字符串类型。目前支持的数据类型参考 :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据,而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size,所以这里提供shape时不用关心batch size,只需关心一条样本的shape即可,更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知,:math:`x` 为 :math:`13` 维的实数向量,:math:`y` 为实数,可使用下面代码定义数据层: - -.. code-block:: python - - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -该模型使用的数据比较简单,事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。 - -前向计算逻辑 ------------- - -实现一个模型最重要的部分是实现计算逻辑,PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同,通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子,以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子,一般地可以使用下面的方式: - -.. code-block:: python - - op_1_out = fluid.layers.op_1(input=op_1_in, ...) - op_2_out = fluid.layers.op_2(input=op_1_out, ...) - ... - -其中op_1和op_2表示算子类型,可以是fc来执行线性变换(全连接),也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中,op_1的输出是op_2的输入,那么在执行计算时,会先计算op_1,然后计算op_2。更复杂的模型可能需要使用控制流算子,依据输入数据来动态执行,针对这种情况,PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :code:`fluid.layers`。具体到这个任务, 我们使用一个fc算子: - -.. code-block:: python - - y_predict = fluid.layers.fc(input=x, size=1, act=None) - -损失函数 --------- - -损失函数对应求解目标,我们可以通过最小化损失来求解模型。大多数模型使用的损失函数,输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时,损失算子的输出有多个值,每个值对应一条样本的损失,所以通常会在损失算子后面使用mean等算子,来对损失做归约。模型在一次前向迭代后会得到一个损失值,PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失: - -.. code-block:: python - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - -优化方法 --------- - -确定损失函数后,可以通过前向计算得到损失值,然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数,最简单的算法是随机梯度下降法::math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果,学术界先后提出了很多优化算法,包括: :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数,一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法,学习率一般是一个需要指定的比较重要的超参数,需要通过实验仔细调整。这里采用随机梯度下降算法: - -.. code-block:: python - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - -更多优化算子可以参考 :code:`fluid.optimizer()` 。 - -下一步做什么? -############## - -使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同,涉及的计算逻辑不同,损失函数不同,优化方法也不同。PaddlePaddle提供了丰富的模型示例,可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 `_ 查看官方提供的示例。 diff --git a/doc/fluid/new_docs/user_guides/howto/debug/index.rst b/doc/fluid/new_docs/user_guides/howto/debug/index.rst deleted file mode 100644 index 0878e17b4069be6b08bc85a35e77ba6421633218..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/debug/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -############ -Debug 工具 -############ - -PaddlePaddle 提供了如下方式方便 Debug 训练 情况 - -.. toctree:: - :maxdepth: 2 - - visualdl.md diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md deleted file mode 100644 index 99f8bee5ca1519ccf5d7c35ad2a64da4a8841ada..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md +++ /dev/null @@ -1,219 +0,0 @@ -# VisualDL (Visualize the Deep Learning) -

- -

- -## 介绍 -VisualDL是一个面向深度学习任务设计的可视化工具,包含了scalar、参数分布、模型结构、图像可视化等功能,项目正处于高速迭代中,新的组件会不断加入。 - -目前大多数DNN平台均使用Python作为配置语言,VisualDL原生支持python的使用, -通过在模型的Python配置中添加几行,便可以为训练过程提供丰富的可视化支持。 - -除了Python SDK之外,VisualDL底层采用C++编写,其暴露的C++ SDK也可以集成到其他平台中, -实现原生的性能和定制效果。 - -## 组件 -VisualDL 目前支持4种组件: - -- graph -- scalar -- image -- histogram - -### Graph -兼容 ONNX(Open Neural Network Exchange)[https://github.com/onnx/onnx], 通过与 python SDK的结合,VisualDL可以兼容包括 PaddlePaddle, pytorch, mxnet在内的大部分主流DNN平台。 - -

- -

- -### Scalar -可以用于展示训练测试的误差趋势 - -

- -

- -### Image -可以用于可视化任何tensor,或模型生成的图片 - -

- -

- -### Histogram - -用于可视化任何tensor中元素分布的变化趋势 - -

- -

- -## 快速尝试 -请使用下面的命令,来快速测试 VisualDL。 - -``` -# 安装,建議是在虚拟环境或anaconda下。 -pip install --upgrade visualdl - -# 运行一个例子,vdl_create_scratch_log 将创建测试日志 -vdl_create_scratch_log -visualDL --logdir=scratch_log --port=8080 - -# 访问 http://127.0.0.1:8080 -``` - -如果以上步骤出现问题,很可能是因为python或pip不同版本或不同位置所致,以下安装方法能解决。 - -## 使用 virtualenv 安装 - -[Virtualenv](https://virtualenv.pypa.io/en/stable/) 能创建独立Python环境,也能确保Python和pip的相对位置正确。 - -在macOS上,安装pip和virtualenv如下: -``` -sudo easy_install pip -pip install --upgrade virtualenv -``` - -在Linux上,安装pip和virtualenv如下: -``` -sudo apt-get install python3-pip python3-dev python-virtualenv -``` - -然后创建一个虚拟环境: -``` -virtualenv ~/vdl # for Python2.7 -virtualenv -p python3 ~/vdl for Python 3.x -``` - -```~/vdl``` 是你的Virtualenv目录, 你也可以选择任一目录。 - -激活虚拟环境如下: -``` -source ~/vdl/bin/activate -``` - -现在再安装 VisualDL 和运行范例: - -``` -pip install --upgrade visualdl - -# 运行一个例子,vdl_create_scratch_log 将创建测试日志 -vdl_create_scratch_log -visualDL --logdir=scratch_log --port=8080 - -# 访问 http://127.0.0.1:8080 -``` -如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上,运行`pip install --upgrade protobuf`就能解决。 - -如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。 - - -## 使用 Anaconda 安装 - -Anaconda是一个用于科学计算的Python发行版,提供了包管理与环境管理的功能,可以很方便地解决多版本python并存、切换以及各种第三方包安装问题。 - -请根据[Anaconda下载网站](https://www.anaconda.com/download) 的指示去下载和安装Anaconda. -下载Python 3.6版本的command-Line installer. - -创建conda环境名字为```vdl```或任何名字: -``` -conda create -n vdl pip python=2.7 # or python=3.3, etc. -``` - -激活conda环境如下: -``` -source activate vdl -``` - -现在再安装 VisualDL 和运行范例: - -``` -pip install --upgrade visualdl - -# 运行一个例子,vdl_create_scratch_log 将创建测试日志 -vdl_create_scratch_log -visualDL --logdir=scratch_log --port=8080 - -# 访问 http://127.0.0.1:8080 -``` - -如果仍然遇到安装问题,请尝试以下用源代码安装方法。 - -### 使用代码安装 -``` -#建議是在虚拟环境或anaconda下。 -git clone https://github.com/PaddlePaddle/VisualDL.git -cd VisualDL - -python setup.py bdist_wheel -pip install --upgrade dist/visualdl-*.whl -``` - -如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md) - - -## SDK -VisualDL 同时提供了python SDK 和 C++ SDK 来实现不同方式的使用。 - -### Python SDK -VisualDL 现在支持 Python 2和 Python 3。 - -以最简单的Scalar组件为例,尝试创建一个scalar组件并插入多个时间步的数据: - -```python -import random -from visualdl import LogWriter - -logdir = "./tmp" -logger = LogWriter(logdir, sync_cycle=10000) - -# mark the components with 'train' label. -with logger.mode("train"): - # create a scalar component called 'scalars/scalar0' - scalar0 = logger.scalar("scalars/scalar0") - -# add some records during DL model running. -for step in range(100): - scalar0.add_record(step, random.random()) -``` - -### C++ SDK -上面 Python SDK 中代码完全一致的C++ SDK用法如下 -```c++ -#include -#include -#include "visualdl/sdk.h" - -namespace vs = visualdl; -namespace cp = visualdl::components; - -int main() { - const std::string dir = "./tmp"; - vs::LogWriter logger(dir, 10000); - - logger.SetMode("train"); - auto tablet = logger.AddTablet("scalars/scalar0"); - - cp::Scalar scalar0(tablet); - - for (int step = 0; step < 1000; step++) { - float v = (float)std::rand() / RAND_MAX; - scalar0.AddRecord(step, v); - } - - return 0; -} -``` -## 启动Board -当训练过程中已经产生了日志数据,就可以启动board进行实时预览可视化信息 - -``` -visualDL --logdir -``` - -board 还支持一下参数来实现远程的访问: - -- `--host` 设定IP -- `--port` 设定端口 -- `--model_pb` 指定 ONNX 格式的模型文件 diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst deleted file mode 100644 index 6f6698cadcba4d9645fdc4a8a74d899598b96d99..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/evaluation/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -############ -模型评估和调试 -############ - -PaddlePaddle Fluid提供了常用的模型评估指标,并提供了VisualDL工具可视化模型效果。 - -.. toctree:: - :maxdepth: 2 - - metrics diff --git a/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst b/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst deleted file mode 100644 index f37968a50350a90e698cb1a63bd501635753e7fb..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/evaluation/metrics.rst +++ /dev/null @@ -1,62 +0,0 @@ -############ -模型评估 -############ - -模型评估是用指标反映模型在预期目标下精度,根据模型任务决定观察指标,作为在训练中调整超参数,评估模型效果的重要依据。 -metric函数的输入为当前模型的预测preds和labels,输出是自定义的。metric函数和loss函数非常相似,但是metric并不是模型训练网络组成部分。 - -用户可以通过训练网络得到当前的预测preds和labels,在Python端定制metric函数;也可以通过定制c++ Operator的方式,在GPU上加速metric计算。 - -paddle.fluid.metrics模块包含该功能 - - -常用指标 -############ - -metric函数根据模型任务不同,指标构建方法因任务而异。 - -回归类型任务labels是实数,因此loss和metric函数构建相同,可参考MSE的方法。 -分类任务常用指标为分类指标,本文提到的一般是二分类指标,多分类和多标签需要查看对应的API文档。例如排序指标auc,多分类可以作为0,1分类任务,auc指标仍然适用。 -Fluid中包含了常用分类指标,例如Precision, Recall, Accuracy等,更多请阅读API文档。以 :ref:`Precision` 为例,具体方法为 - -.. code-block:: python - - >>> import paddle.fluid as fluid - >>> labels = fluid.layers.data(name="data", shape=[1], dtype="int32") - >>> data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") - >>> pred = fluid.layers.fc(input=data, size=1000, act="tanh") - >>> acc = fluid.metrics.Precision() - >>> for pass in range(PASSES): - >>> acc.reset() - >>> for data in train_reader(): - >>> loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - >>> acc.update(preds=preds, labels=labels) - >>> numpy_acc = acc.eval() - - -其他任务例如MultiTask Learning,Metric Learning,Learning To Rank各种指标构造方法请参考API文档。 - -自定义指标 -############ -Fluid支持自定义指标,灵活支持各类计算任务。下文通过一个简单的计数器metric函数,实现对模型的评估。 -其中preds是模型预测值,labels是给定的标签。 - -.. code-block:: python - - >>> class MyMetric(MetricBase): - >>> def __init__(self, name=None): - >>> super(MyMetric, self).__init__(name) - >>> self.counter = 0 # simple counter - - >>> def reset(self): - >>> self.counter = 0 - - >>> def update(self, preds, labels): - >>> if not _is_numpy_(preds): - >>> raise ValueError("The 'preds' must be a numpy ndarray.") - >>> if not _is_numpy_(labels): - >>> raise ValueError("The 'labels' must be a numpy ndarray.") - >>> self.counter += sum(preds == labels) - - >>> def eval(self): - >>> return self.counter diff --git a/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst deleted file mode 100644 index 3884284ea020fe94ed9c03ec84c856ee44aa8c3f..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _install_or_build_cpp_inference_lib: - -安装与编译C++预测库 -=========================== - -直接下载安装 -------------- - -====================== ======================================== -版本说明 C++预测库 -====================== ======================================== -cpu_avx_mkl `fluid.tgz `_ -cpu_avx_openblas `fluid.tgz `_ -cpu_noavx_openblas `fluid.tgz `_ -cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ -cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ -cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ -====================== ======================================== - -从源码编译 ----------- -用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: - -================= ========= -选项 值 -================= ========= -CMAKE_BUILD_TYPE Release -FLUID_INSTALL_DIR 安装路径 -WITH_FLUID_ONLY ON(推荐) -WITH_SWIG_PY OFF(推荐 -WITH_PYTHON OFF(推荐) -WITH_GPU ON/OFF -WITH_MKL ON/OFF -================= ========= - -建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 - -下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): - - .. code-block:: bash - - pip install paddlepaddle-gpu - PADDLE_ROOT=/path/of/capi - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - mkdir build - cd build - cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_FLUID_ONLY=ON \ - -DWITH_SWIG_PY=OFF \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. - make - make inference_lib_dist - -成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) -均会存放于PADDLE_ROOT目录中。目录结构如下: - - .. code-block:: text - - PaddleRoot/ - ├── CMakeCache.txt - ├── paddle - │   └── fluid - │   ├── framework - │   ├── inference - │   ├── memory - │   ├── platform - │   ├── pybind - │   └── string - ├── third_party - │   ├── boost - │   │   └── boost - │   ├── eigen3 - │   │   ├── Eigen - │   │   └── unsupported - │   └── install - │   ├── gflags - │   ├── glog - │   ├── mklml - │   ├── protobuf - │   ├── snappy - │   ├── snappystream - │   └── zlib - └── version.txt - -version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: - - .. code-block:: text - - GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8 - WITH_MKL: ON - WITH_GPU: ON - CUDA version: 8.0 - CUDNN version: v5 diff --git a/doc/fluid/new_docs/user_guides/howto/inference/index.rst b/doc/fluid/new_docs/user_guides/howto/inference/index.rst deleted file mode 100644 index 45e1a2883773b92ed47ef8d51417bbdcd060b4ec..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/inference/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -############ -模型预测部署 -############ - -PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线 - -.. toctree:: - :maxdepth: 2 - - build_and_install_lib_cn.rst - native_infer.rst diff --git a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst deleted file mode 100644 index 6d6f3035c0b5c985cd39d45df9f1bcce50dcefa0..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst +++ /dev/null @@ -1,106 +0,0 @@ -Paddle 预测 API -=============== - -为了更简单方便的预测部署,Fluid 提供了一套高层 API -用来隐藏底层不同的优化实现。 - -`预测库相关代码 `_ -包括 - -- 头文件 ``paddle_inference_api.h`` 定义了所有的接口 -- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` - - -编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 - -下面是一些 API 概念的介绍 - -PaddleTensor ------------- - -PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 - -.. code:: cpp - - struct PaddleTensor { - std::string name; // variable name. - std::vector shape; - PaddleBuf data; // blob of data. - PaddleDType dtype; - }; - -- ``name`` 用于指定输入数据对应的 模型中variable 的名字 - (暂时没有用,但会在后续支持任意 target 时启用) -- ``shape`` 表示一个 Tensor 的 shape -- ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf`` - 中,\ ``PaddleBuf`` - 可以接收外面的数据或者独立\ ``malloc``\ 内存,详细可以参考头文件中相关定义。 -- ``dtype`` 表示 Tensor 的数据类型 - -engine ------- - -高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine - -- 原生 engine,由 paddle 原生的 forward operator - 组成,可以天然支持所有paddle 训练出的模型, -- Anakin engine,封装了 - `Anakin `__ - ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle - 模型, -- TensorRT mixed engine,用子图的方式支持了 - `TensorRT `__ ,支持所有paddle - 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) - -其实现为 - -.. code:: cpp - - enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. - kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. - }; - -预测部署过程 ------------- - -总体上分为以下步骤 - -1. 用合适的配置创建 ``PaddlePredictor`` -2. 创建输入用的 ``PaddleTensor``\ ,传入到 ``PaddlePredictor`` 中 -3. 获取输出的 ``PaddleTensor`` ,将结果取出 - -下面完整演示一个简单的模型,部分细节代码隐去 - -.. code:: cpp - - #include "paddle_inference_api.h" - - // 创建一个 config,并修改相关设置 - paddle::NativeConfig config; - config.model_dir = "xxx"; - config.use_gpu = false; - // 创建一个原生的 PaddlePredictor - auto predictor = - paddle::CreatePaddlePredictor(config); - // 创建输入 tensor - int64_t data[4] = {1, 2, 3, 4}; - paddle::PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; - // 创建输出 tensor,输出 tensor 的内存可以复用 - std::vector outputs; - // 执行预测 - CHECK(predictor->Run(slots, &outputs)); - // 获取 outputs ... - -编译时,联编 ``libpaddle_fluid.a/.so`` 便可。 - -详细代码参考 ------------- - -- `inference - demos `__ -- `复杂单线程/多线程例子 `__ diff --git a/doc/fluid/new_docs/user_guides/howto/modification/foo.rst b/doc/fluid/new_docs/user_guides/howto/modification/foo.rst deleted file mode 100644 index 9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/modification/foo.rst +++ /dev/null @@ -1,3 +0,0 @@ -### -FAQ -### diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst deleted file mode 100644 index c3bf033bb8316eeb4901c0cdc61e0556c8816dac..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/prepare_data/feeding_data.rst +++ /dev/null @@ -1,169 +0,0 @@ -.. _user_guide_use_numpy_array_as_train_data: - -########################### -使用Numpy Array作为训练数据 -########################### - -PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层; -再使用 Numpy Array 或者直接使用Python创建C++的 -:code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给 -:code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。 - -数据层配置 -########## - -通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为: - -.. code-block:: python - - import paddle.fluid as fluid - - image = fluid.layers.data(name="image", shape=[3, 224, 224]) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - # use image/label as layer input - prediction = fluid.layers.fc(input=image, size=1000, act="softmax") - loss = fluid.layers.cross_entropy(input=prediction, label=label) - ... - -上段代码中,:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data` -创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据; -:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是: - -1. Fluid中默认使用 :code:`-1` 表示 batch size 维度,默认情况下会在 :code:`shape` - 的第一个维度添加 :code:`-1` 。 所以 上段代码中, 我们可以接受将一个 - :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size - 维度的位置的话,请设置 :code:`fluid.layers.data(append_batch_size=False)` 。 - 请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。 - - -2. Fluid中用来做类别标签的数据类型是 :code:`int64`,并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。 - -.. _user_guide_feed_data_to_executor: - -传递训练数据给执行器 -#################### - -:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。 -这个参数是一个Python的字典。它的键是数据层的名字,例如上文代码中的 :code:`image`。 -它的值是对应的numpy array。 - -例如: - -.. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(feed={ - "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), - "label": numpy.random.random(size=(32, 1)).astype('int64') - }) - -进阶使用 -######## - -如何传入序列数据 ----------------- - -序列数据是PaddlePaddle Fluid支持的特殊数据类型,可以使用 :code:`LoDTensor` 作为 -输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据; -2.每个序列的长度信息。 -用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。 - -传入序列信息的时候,需要设置序列嵌套深度,:code:`lod_level`。 -例如训练数据是词汇组成的句子,:code:`lod_level=1`;训练数据是 词汇先组成了句子, -句子再组成了段落,那么 :code:`lod_level=2`。 - -例如: - -.. code-block:: python - - sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1) - - ... - - exe.run(feed={ - "sentence": create_lod_tensor( - data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1), - lod=[4, 1, 2], - place=fluid.CPUPlace() - ) - }) - -训练数据 :code:`sentence` 包含三个样本,他们的长度分别是 :code:`4, 1, 2`。 -他们分别是 :code:`data[0:4]`, :code:`data[4:5]` 和 :code:`data[5:7]`。 - -如何分别设置ParallelExecutor中每个设备的训练数据 ------------------------------------------------- - -用户将数据传递给使用 :code:`ParallelExecutor.run(feed=...)` 时, -可以显示指定每一个训练设备(例如GPU)上的数据。 -用户需要将一个列表传递给 :code:`feed` 参数,列表中的每一个元素都是一个字典。 -这个字典的键是数据层的名字,值是数据层的值。 - -例如: - -.. code-block:: python - - parallel_executor = fluid.ParallelExecutor() - parallel_executor.run( - feed=[ - { - "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), - "label": numpy.random.random(size=(32, 1)).astype('int64') - }, - { - "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'), - "label": numpy.random.random(size=(16, 1)).astype('int64') - }, - ] - ) - -上述代码中,GPU0会训练 32 个样本,而 GPU1训练 16 个样本。 - - -.. _user_guide_customize_batch_size_rank: - -自定义BatchSize维度 -------------------- - -PaddlePaddle Fluid默认batch size是数据的第一维度,以 :code:`-1` 表示。但是在高级 -使用中,batch_size 可以固定,也可以是其他维度或者多个维度来表示。这都需要设置 -:code:`fluid.layers.data(append_batch_size=False)` 来完成。 - -1. 固定batch size维度 - - .. code-block:: python - - image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False) - - 这里,:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。 - -2. 使用其他维度表示batch size - - .. code-block:: python - - sentence = fluid.layers.data(name="sentence", - shape=[80, -1, 1], - append_batch_size=False, - dtype="int64") - - 这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经 - 网络中。 - - -.. _user_guide_paddle_support_data_types: - -Fluid目前支持的数据类型 ------------------------ - -PaddlePaddle Fluid目前支持的数据类型包括: - - * float16: 部分操作支持 - * float32: 主要实数类型 - * float64: 次要实数类型,支持大部分操作 - * int32: 次要标签类型 - * int64: 主要标签类型 - * uint64: 次要标签类型 - * bool: 控制流数据类型 - * int16: 次要标签类型 - * uint8: 输入数据类型,可用于图像像素 \ No newline at end of file diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst deleted file mode 100644 index cca3684b78518867eae95d82e1347b52427ddc81..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. _user_guide_prepare_data: - -######## -准备数据 -######## - -PaddlePaddle Fluid支持两种传入数据的方式: - -1. 用户需要使用 :code:`fluid.layers.data` -配置数据输入层,并在 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` -中,使用 :code:`executor.run(feed=...)` 传入训练数据。 - -2. 用户需要先将训练数据 -转换成 Paddle 识别的 :code:`fluid.recordio_writer` , 再使用 -:code:`fluid.layers.open_files` 以及 :code:`fluid.layers.reader` 配置数据读取。 - -这两种准备数据方法的比较如下: - -.. _user_guide_prepare_data_comparision: - -+------------+----------------------------------+---------------------------------------+ -| | Feed数据 | 使用Reader | -+============+==================================+=======================================+ -| API接口 | :code:`executor.run(feed=...)` | :code:`fluid.layers.reader` | -+------------+----------------------------------+---------------------------------------+ -| 数据格式 | Numpy Array | :code:`fluid.recordio_writer` | -+------------+----------------------------------+---------------------------------------+ -| 数据增强 | Python端使用其他库完成 | 使用Fluid中的Operator 完成 | -+------------+----------------------------------+---------------------------------------+ -| 速度 | 慢 | 快 | -+------------+----------------------------------+---------------------------------------+ -| 推荐用途 | 调试模型 | 工业训练 | -+------------+----------------------------------+---------------------------------------+ - -这些准备数据的详细使用方法,请参考: - -.. toctree:: - :maxdepth: 2 - - feeding_data - -Python Reader -############# - -为了方便用户在Python中定义数据处理流程,PaddlePaddle Fluid支持 Python Reader, -具体请参考: - -.. toctree:: - :maxdepth: 2 - - reader.md diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md b/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md deleted file mode 100644 index aa50e4d26166536eaf8044d527debd8ad46060f6..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/prepare_data/reader.md +++ /dev/null @@ -1,210 +0,0 @@ -```eval_rst -.. _user_guide_reader: -``` - -# Python Reader - -During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: - -- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. -- A *reader creator*: A function that returns a reader function. -- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. -- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. - -and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. - -## Data Reader Interface - -*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: - -``` -iterable = data_reader() -``` - -The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) - -An example implementation for single item data reader creator is as follows: - -```python -def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader -``` - -An example implementation for multiple item data reader creator is as follows: -```python -def reader_creator_random_image_and_label(width, height, label): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height), label - return reader -``` - -## Batch Reader Interface - -*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. - -Here are some valid outputs: - -```python -# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. -[(1, 1, 1), -(2, 2, 2), -(3, 3, 3)] - -# a mini batch of three data items, each data item is a list (single column). -[([1,1,1],), -([2,2,2],), -([3,3,3],)] -``` - -Please note that each item inside the list must be a tuple, below is an invalid output: -```python - # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three columns of data, each of which is 1. -[[1,1,1], -[2,2,2], -[3,3,3]] -``` - -It is easy to convert from a reader to a batch reader: - -```python -mnist_train = paddle.dataset.mnist.train() -mnist_train_batch_reader = paddle.batch(mnist_train, 128) -``` - -It is also straight forward to create a custom batch reader: - -```python -def custom_batch_reader(): - while True: - batch = [] - for i in xrange(128): - batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. - yield batch - -mnist_random_image_batch_reader = custom_batch_reader -``` - -## Usage - -Following is how we can use the reader with PaddlePaddle: -The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: - -```python -# two data layer is created: -image_layer = paddle.layer.data("image", ...) -label_layer = paddle.layer.data("label", ...) - -# ... -batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) -paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) -``` - -## Data Reader Decorator - -The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. - -Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: - -### Prefetch Data - -Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. - -Use `paddle.reader.buffered` to prefetch data: - -```python -buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) -``` - -`buffered_reader` will try to buffer (prefetch) `100` data entries. - -### Compose Multiple Data Readers - -For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). - -We can do the following : - -```python -def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader - -def reader_creator_bool(t): - def reader: - while True: - yield t - return reader - -true_reader = reader_creator_bool(True) -false_reader = reader_creator_bool(False) - -reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) -# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care about the second item at this time. -paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) -``` - -### Shuffle - -Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. - -Example: -```python -reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) -``` - -## Q & A - -### Why does a reader return only a single entry, and not a mini batch? - -Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). - -We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. - -### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? - -In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. - -### Why use a dictionary instead of a list to provide mapping? - -Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). - -### How to create a custom data reader creator ? - -```python -def image_reader_creator(image_path, label_path, n): - def reader(): - f = open(image_path) - l = open(label_path) - images = numpy.fromfile( - f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') - images = images / 255.0 * 2.0 - 1.0 - labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") - for i in xrange(n): - yield images[i, :], labels[i] # a single entry of data is created each time - f.close() - l.close() - return reader - -# images_reader_creator creates a reader -reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) -paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) -``` - -### How is `paddle.train` implemented - -An example implementation of paddle.train is: - -```python -def train(batch_reader, mapping, batch_size, total_pass): - for pass_idx in range(total_pass): - for mini_batch in batch_reader(): # this loop will never end in online learning. - do_forward_backward(mini_batch, mapping) -``` diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md deleted file mode 100644 index c4afd536c67b24a17e4437ecedf779ddcddcbc98..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_cn.md +++ /dev/null @@ -1,60 +0,0 @@ -# Checkpoint功能使用指南 - -## 背景 -单机/多机在训练过程中会由于软件/硬件的问题出现异常,导致训练中断,进而导致训练无结果或结果不可用,浪费大量时间和机器性能。 - -## 目的 -Checkpoint功能能够在训练中途对训练数据中间数据进行保存,出现异常恢复训练的时候能够加载中途保存的数据继续训练, 实现单机/多机的容错训练的功能。 - -## 说明 -### 目前已实现的参数保存: -1. 基于Trainer 0 实现训练过程中的参数保存 -2. 基于PServer 实现了```Distribute Lookup Table```相关参数保存 -### Fluid Checkpoint 保存数据目录结构: - -``` -checkpoint_dir (用户定义的checkpoint目录) -├── checkpoint_0 (第一次保存) -│ ├── __lockup_table__ (Distribute Lookup Table 目录) -│ │ ├── table_pserver_0 (Pserver 0 号保存的lookup table 数据) -│ │ └── table_pserver_1 -│ ├── __model__ (model 目录) -│ │ └── var.w_1 -│ └── trainer_0 (trainer 自有数据保存) -│ ├── epoch_id -│ └── step_id -└── checkpoint_1 (第二次保存) -``` - -## 使用方法 -### 声明Fluid.CheckpointConfig -用户对checkpoint功能的配置,主要是配置对象```Fluid```中的```CheckpointConfig```. - -```CheckpointConfig``` 包括4个参数: - -| 参数 | 类型 | 说明 | -| - | :-: | - | -| checkpoint_dir | int| checkpoint存储目录 | -| max_num_checkpoints | int | 最大保存的checkpoint副本数 | -| epoch_interval | int | 每隔epoch_interval轮epoch | -| step_interval | int | 每隔step_interval轮step | - -### 在Fluid.Trainer对象的声明中加入Fluid.CheckpointConfig的声明 -Trainer的__init__方法的参数中包含了对```CheckpointConfig```, 需要传入在声明Trainer前声明的```CheckpointConfig```对象。 -如: -```python -config = CheckpointConfig( - checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, - epoch_interval = 2, step_interval = 10) -trainer = Trainer(..., checkpoint_config=config) -``` -定义和声明完成后, 训练在运行过程中就会在指定的step和epoch处进行保存,出现异常时,就会自动从最新的checkpoint目录进行参数恢复啦! - -## 相关API -[Trainer API 说明](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) - -## 注意 -1. 保证每个训练的```checkpoint_dir``` 与其他训练独立。 -2. 最大副本数量```max_num_checkpoints```需要根据磁盘容量以及模型的大小进行调整, 保证磁盘的可用性。 -3. ```epoch_interval``` 和 ```step_interval``` 不宜过小, 频繁的进行checkpoint会拖慢训练速度。 -4. **分布式训练**的过程中:每个Trainer都会在```checkpoint_dir```目录中保存当前Trainer的参数(只有Trainer 0会保存模型的参数),需要**分布式文件系统(HDFS等)**将同```checkpoint_dir```目录的数据进行合并才能得到完整的数据,恢复训练的时候需要用完整的数据进行恢复。 diff --git a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md b/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md deleted file mode 100644 index 14d37246ca0cab8715e244fda9624d0d59f8ec5f..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/checkpoint_doc_en.md +++ /dev/null @@ -1,62 +0,0 @@ -# Checkpoint User Guide - -## Background -In many cases, Stand-alone training and Distributed training can be aborted by the software problem or hardware problem. More seriously, we waste so much time and the performance of the machine but get nothing, which makes us frustrating and we have to restart it again. - -## Purpose -The feature of ```Checkpoint``` can save Intermediate model variables, lookup table variable, and other needs data in checkpoint directory. When the exception occurs, we can load these variables from the checkpoint directory immediately. -## Introduce -### Complete Features Currently: -1. The Trainer 0 will save model variables in training. -2. Each of the Trainer will save its own arguments needed. -3. Each of the Parameter Server will save ```Distribute Lookup Table``` variables in training. -### Fluid Checkpoint directory structure: - -``` -checkpoint_dir (the checkpoint directory user define) -├── checkpoint_0 (the first save directory) -│ ├── __lockup_table__ (Distribute Lookup Table directory) -│ │ ├── table_pserver_0 (Lookup table's data about Pserver 0) -│ │ └── table_pserver_1 -│ ├── __model__ (model directory) -│ │ └── var.w_1 -│ └── trainer_0 (each trainer will save its own data) -│ ├── epoch_id -│ └── step_id -└── checkpoint_1 (the second save directory) -``` - -## usage -### Fluid.CheckpointConfig construct -When the user wants to use ```Checkpoint``` feature, the main thing user have to do is declare ```CheckpointConfig``` and construct it. - -```CheckpointConfig``` has 4 member variables need to be initialized: - -| Member Variable | Type | Comment | -| - | :-: | - | -| checkpoint_dir | int| checkpoint directory | -| max_num_checkpoints | int | Maximum number of checkpoint copies | -| epoch_interval | int | epoch interval times | -| step_interval | int | step interval times | - -### Add Fluid.CheckpointConfig's declaration in Fluid.Trainer -Because the initialization of Trainer needs an instance of ```CheckpointConfig```., we should declare ```CheckpointConfig``` in ```Fluid``` first. - -For example: -```python -config = CheckpointConfig( - checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, - epoch_interval = 2, step_interval = 10) -trainer = Trainer(..., checkpoint_config=config) -``` - -After all the things done, the train will save checkpoint at the specified epoch and step, when the train is aborted, the user can restart it, the train will restore from the latest copy. - -## Related API -[Related Trainer API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) - -## Attention -1. Make the ```checkpoint_dir``` only be used by one train job. -2. The number of ```max_num_checkpoints``` need to be adjusted by the disk size and model size. -3. Too frequently to slow down the train speed, so too ```small epoch_interval``` and ```step_interval``` are not suitable. -4. **In distributed train**, each Trainer will save arguments in its ```checkpoint_dir``` (Only Trainer 0 will save model variables). We need **distributed file system (HDFS, etc)** to merge all the ```checkpoint_dir``` to get the whole data. diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst deleted file mode 100644 index 00ec9e819c81fae3263b1f1e6bcedf524f2b3991..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/cluster_howto.rst +++ /dev/null @@ -1,160 +0,0 @@ -.. _cluster_howto - -Fluid分布式训练使用手册 -==================== - -分布式训练基本思想 ---------------- - -分布式深度学习训练通常分为两种并行化方法:数据并行,模型并行,参考下图: - -.. image:: src/parallelism.png - -在模型并行方式下,模型的层和参数将被分布在多个节点上,模型在一个mini-batch的前向和反向训练中,将经过多次跨\ -节点之间的通信。每个节点只保存整个模型的一部分;在数据并行方式下,每个节点保存有完整的模型的层和参数,每个节点\ -独自完成前向和反向计算,然后完成梯度的聚合并同步的更新所有节点上的参数。Fluid目前版本仅提供数据并行方式,另外\ -诸如模型并行的特例实现(超大稀疏模型训练)功能将在后续的文档中予以说明。 - -在数据并行模式的训练中,Fluid使用了两种通信模式,用于应对不同训练任务对分布式训练的要求,分别为RPC通信和Collective -通信。其中RPC通信方式使用 `gRPC `_ ,Collective通信方式使用 -`NCCL2 `_ 。 - -.. csv-table:: 下面是一个RPC通信和Collective通信的横向对比: - :header: "Feature", "Coolective", "RPC" - - "Ring-Based通信", "Yes", "No" - "异步训练", "Yes", "Yes" - "分布式模型", "No", "Yes" - "容错训练", "No", "Yes" - "性能", "Faster", "Fast" - -- RPC通信方式的结构: - - .. image:: src/dist_train_pserver.png - - 使用RPC通信方式的数据并行分布式训练,会启动多个pserver进程和多个trainer进程,每个pserver进程\ - 会保存一部分模型参数,并负责接收从trainer发送的梯度并更新这些模型参数;每个trainer进程会保存一份\ - 完整的模型,并使用一部分数据进行训练,然后向pserver发送梯度,最后从pserver拉取更新后的参数。 - - pserver进程可以在和trainer完全不同的计算节点上,也可以和trainer公用节点。一个分布式任务所需要的\ - pserver进程个数通常需要根据实际情况调整,已达到最佳的性能,然而通常来说pserver的进程不会比trainer\ - 更多。 - - 在使用GPU训练时,pserver可以选择使用GPU或只使用CPU,如果pserver也使用GPU,则会增加一次从CPU拷贝\ - 接收到的梯度数据到GPU的开销,在某些情况下会导致整体训练性能降低。 - -- NCCL2通信方式的结构: - - .. image:: src/dist_train_nccl2.png - - 使用NCCL2(Collective通信方式)进行分布式训练,是不需要启动pserver进程的,每个trainer进程都保存\ - 一份完整的模型参数,在完成计算梯度之后通过trainer之间的相互通信,Reduce梯度数据到所有节点的所有设备\ - 然后每个节点在各自完成参数更新。 - -使用parameter server方式的训练 ------------------------------- - -使用 :code:`trainer` API,程序可以自动的通过识别环境变量决定是否已分布式方式执行。 - -.. csv-table:: 需要在您的分布式环境中配置的环境变量包括: - :header: "环境变量", "说明" - - "PADDLE_TRAINING_ROLE", "当前进程的角色,可以是PSERVER或TRAINER" - "PADDLE_PSERVER_PORT", "parameter使用的端口" - "PADDLE_PSERVER_IPS", "parameter server的IP地址列表,用逗号分开" - "PADDLE_TRAINERS", "分布式任务中trainer节点的个数" - "PADDLE_CURRENT_IP", "当前节点的IP" - "PADDLE_TRAINER_ID", "trainer节点的id,从0~n-1,不能有重复" - -使用更加底层的 :code:`transpiler` API可以提供自定义的分布式训练的方法,比如可以在同一台机器上, -启动多个pserver和trainer进行训练,使用底层API的方法可以参考下面的样例代码: - -.. code-block:: python - - role = "PSERVER" - trainer_id = 0 - pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171" - current_endpoint = "127.0.0.1:6170" - trainers = 4 - t = fluid.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) - if role == "PSERVER": - pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, - pserver_prog) - exe.run(pserver_startup) - exe.run(pserver_prog) - elif role == "TRAINER": - train_loop(t.get_trainer_program()) - - -选择同步或异步训练 -++++++++++++++++++ - -Fluid分布式任务可以支持同步训练或异步训练,在同步训练方式下,所有的trainer节点,会在每个mini-batch -同步地合并所有节点的梯度数据并发送给parameter server完成更新,在异步训练方式下,每个trainer没有相互\ -同步等待的过程,可以独立的parameter server的参数。通常情况下,使用异步训练方式,可以在trainer节点\ -更多的时候比同步训练方式有更高的总体吞吐量。 - -在调用 :code:`transpile` 函数时,默认会生成同步训练的分布式程序,通过指定 :code:`sync_mode=False` -参数即可生成异步训练的程序: - -.. code-block:: python - - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=False) - - -选择参数分布方法 -++++++++++++++++ - -参数 :code:`split_method` 可以指定参数在parameter server上的分布方式。 - -Fluid默认使用 `RoundRobin `_ -方式将参数分布在多个parameter server上。此方式在默认未关闭参数切分的情况下,参数会较平均的分布在所有的 -parameter server上。如果需要使用其他,可以传入其他的方法,目前可选的方法有: :code:`RoundRobin` 和 -:code:`HashName` 。也可以使用自定义的分布方式,只需要参考 -`这里 `_ -编写自定义的分布函数。 - - -关闭切分参数 -++++++++++++ - -参数 :code:`slice_var_up` 指定是否将较大(大于8192个元素)的参数切分到多个parameter server已均衡计算负载,默认为开启。 - -当模型中的可训练参数体积比较均匀或者使用自定义的参数分布方法是参数均匀分布在多个parameter server上, -可以选择关闭切分参数,这样可以降低切分和重组带来的计算和拷贝开销: - -.. code-block:: python - - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, slice_var_up=False) - - -使用NCCL2通信方式的训练 --------------------- - -注NCCL2模式目前仅支持trainer API,NCCL2方式并没有很多可选项,也没有"transpiler",所以并没有底层API。 -使用NCCL2方式同样需要配置每个节点的环境变量,此处与parameter server模式有所不同,并不需要启动独立的\ -parameter server的进程,只需要启动多个trainer进程即可。 - - -.. csv-table:: NCCL2模式环境变量说明: - :header: "环境变量", "说明" - - "PADDLE_TRAINER_IPS", "所有Trainer节点的IP列表,用逗号分隔" - "PADDLE_TRAINER_ID", "trainer节点的id,从0~n-1,不能有重复" - "PADDLE_PSERVER_PORT", "一个端口,用于在NCCL2初始化时,广播NCCL ID" - "PADDLE_CURRENT_IP", "当前节点的IP" - -目前使用NCCL2进行分布式训练仅支持同步训练方式。使用NCCL2方式的分布式训练,更适合模型体积较大,并需要使用\ -同步训练和GPU训练,如果硬件设备支持RDMA和GPU Direct,可以达到很高的分布式训练性能。 - -注意如果系统中有多个网络设备,需要手动指定NCCL2使用的设备, -假设需要使用 :code:`eth2` 为通信设备,需要设定如下环境变量: - -.. code-block:: bash - - export NCCL_SOCKET_IFNAME=eth2 - -另外NCCL2提供了其他的开关环境变量,比如指定是否开启GPU Direct,是否使用RDMA等,详情可以参考 -`ncclknobs `_ 。 diff --git a/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst b/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst deleted file mode 100644 index 6131c92d6f5386c7e91b2917d25dd7ae830ff182..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/cluster_quick_start.rst +++ /dev/null @@ -1,143 +0,0 @@ -.. _cluster_quick_start: - -分布式训练快速开始 -================== - -准备工作 --------- - -在本篇文章中,我们将会在介绍如何快速在一个集群中启动一个 PaddlePaddle -的分布式训练任务,在开始之前,请按如下步骤做些准备工作: - -1. 准备一个至少4个节点的集群,并且保证网络可以联通,在本文中我们使用 - ``*.paddlepaddle.com`` 来表示每个节点的主机名称,您可以根据集群的实际情况来修改它。 - -2. 在开始之前确保已经阅读过 :ref:`how_to_install` - 并且可以在集群的所有节点上可以正常运行 PaddlePaddle。 - -启动集群训练任务 ----------------- - -在启动集群训练脚本时,需要在不同的节点上指定不同的环境变量,具体如下: - -+-----------------+-----------------+-----------------+---------------------+ -| 环境变量 | 数据类型 | 样例 | 描述 | -+=================+=================+=================+=====================+ -| PADDLE_TRAINING | str | PSERVER,TRAINER | 训练节点的角色 | -| _ROLE | | | | -+-----------------+-----------------+-----------------+---------------------+ -| PADDLE_PSERVER_ | str | ps0.paddlepaddl | 所有 pserver | -| IPS | | e.com,ps1.paddl | 节点的 IP | -| | | epaddle.com… | 地址或 | -| | | | hostname, | -| | | | 用“,”分隔 | -+-----------------+-----------------+-----------------+---------------------+ -| PADDLE_PSERVER_ | int | 6174 | pserver | -| PORT | | | 节点监听的端口 | -+-----------------+-----------------+-----------------+---------------------+ -| PADDLE_TRAINERS | int | 2 | 训练任务中 | -| | | | trainer | -| | | | 节点的数量 | -+-----------------+-----------------+-----------------+---------------------+ -| PADDLE_CURRENT_ | str | ps0.paddlepaddl | 当前 pserver | -| IP | | e.com | 节点的 IP | -| | | | 地址或 hostanme | -+-----------------+-----------------+-----------------+---------------------+ -| PADDLE_TRAINER_ | int | 0 | 当前 trainer | -| ID | | | 节点的唯一 ID, | -| | | | 取值范围为从0开始到 | -| | | | PADDLE_TRAINERS-1 | -+-----------------+-----------------+-----------------+---------------------+ - -样例代码 -~~~~~~~~ - -将下面程序代码保存为 ``fluid_dist.py`` - -.. code:: python - - import paddle - import paddle.fluid as fluid - import contextlib - import numpy - import unittest - - # train reader - BATCH_SIZE = 20 - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) - - test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.test(), buf_size=500), - batch_size=BATCH_SIZE) - - - def train_program(): - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - loss = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(loss) - - return avg_loss - - def optimizer_func(): - return fluid.optimizer.SGD(learning_rate=0.001) - - def train(use_cuda, train_program): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - trainer = fluid.Trainer( - train_func=train_program, place=place, optimizer_func=optimizer_func) - - def event_handler(event): - if isinstance(event, fluid.EndStepEvent): - if event.step == 10: - test_metrics = trainer.test( - reader=test_reader, feed_order=['x', 'y']) - print("step {0}, loss: {1}".format(event.step, test_metrics)) - trainer.stop() - - trainer.train( - reader=train_reader, - num_epochs=100, - event_handler=event_handler, - feed_order=['x', 'y']) - - train(False, train_program) - -启动trainer节点和pserver节点 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. list-table:: - :header-rows: 1 - - * - 启动节点 - - 启动命令 - - 说明 - * - ps0.paddlepaddle.com - - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps0.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` - - 启动 pserver 节点 - * - ps1.paddlepaddle.com - - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps1.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` - - 启动 pserver 节点 - * - trainer0.paddlepaddle.com - - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=0 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` - - 启动第0号 trainer 节点 - * - trainer1.paddlepaddle.com - - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=1 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` - - 启动第1号 trainer 节点 - -**注意** - -- 需要先启动pserver节点再启动trainer节点 -- 看到trainer节点输出如下日志表示训练任务执行正确 - - .. code:: bash - - step 10, loss: [258.2326202392578] diff --git a/doc/fluid/new_docs/user_guides/howto/training/index.rst b/doc/fluid/new_docs/user_guides/howto/training/index.rst deleted file mode 100644 index 68475101e26b3f695c8003995cc1c6a95426ff27..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/index.rst +++ /dev/null @@ -1,12 +0,0 @@ -############ -训练神经网络 -############ - -PaddlePaddle Fluid支持单机训练,和多节点训练。每种训练模式下,都支持多种训练方法。 - -.. toctree:: - :maxdepth: 2 - - single_node - multi_node - save_load_variables diff --git a/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst b/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst deleted file mode 100644 index 24316f0be0d8f211e680fa15cb432732b5967c79..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/multi_node.rst +++ /dev/null @@ -1,9 +0,0 @@ -######## -多机训练 -######## - -.. toctree:: - :maxdepth: 2 - - cluster_quick_start.rst - cluster_howto.rst diff --git a/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst b/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst deleted file mode 100644 index a96776f4a17a1d6da170bdff9d81771c38912bb5..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/save_load_variables.rst +++ /dev/null @@ -1,172 +0,0 @@ -.. _user_guide_save_load_vars: - -################## -保存与载入模型变量 -################## - -模型变量分类 -############ - -在PaddlePaddle Fluid中,所有的模型变量都用 :code:`fluid.Variable()` 作为基类进行表示。 -在该基类之下,模型变量主要可以分为以下几种类别: - -1. 模型参数 - 模型参数是深度学习模型中被训练和学习的变量,在训练过程中,训练框架根据反向传播算法计算出每一个模型参数当前的梯度, - 并用优化器根据梯度对参数进行更新。模型的训练过程本质上可以看做是模型参数不断迭代更新的过程。 - 在PaddlePaddle Fluid中,模型参数用 :code:`fluid.framework.Parameter` 来表示, - 这是一个 :code:`fluid.Variable()` 的派生类,除了 :code:`fluid.Variable()` 具有的各项性质以外, - :code:`fluid.framework.Parameter` 还可以配置自身的初始化方法、更新率等属性。 - -2. 长期变量 - 长期变量指的是在整个训练过程中持续存在、不会因为一个迭代的结束而被销毁的变量,例如动态调节的全局学习率等。 - 在PaddlePaddle Fluid中,长期变量通过将 :code:`fluid.Variable()` 的 :code:`persistable` - 属性设置为 :code:`True` 来表示。所有的模型参数都是长期变量,但并非所有的长期变量都是模型参数。 - -3. 临时变量 - 不属于上面两个类别的所有模型变量都是临时变量,这种类型的变量只在一个训练迭代中存在,在每一个迭代结束后, - 所有的临时变量都会被销毁,然后在下一个迭代开始之前,又会先构造出新的临时变量供本轮迭代使用。 - 一般情况下模型中的大部分变量都属于这一类别,例如输入的训练数据、一个普通的layer的输出等等。 - - - -如何保存模型变量 -################ - -根据用途的不同,我们需要保存的模型变量也是不同的。例如,如果我们只是想保存模型用来进行以后的预测, -那么只保存模型参数就够用了。但如果我们需要保存一个checkpoint以备将来恢复训练, -那么我们应该将各种长期变量都保存下来,甚至还需要记录一下当前的epoch和step的id。 -因为一些模型变量虽然不是参数,但对于模型的训练依然必不可少。 - -因此,根据需求的不同,我们提供了两套API来分别进行模型的参数和checkpoint的保存。 - -保存模型用于对新样本的预测 -========================== - -如果我们保存模型的目的是用于对新样本的预测,那么只保存模型参数就足够了。我们可以使用 -:code:`fluid.io.save_params()` 接口来进行模型参数的保存。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.save_params(executor=exe, dirname=param_path, main_program=None) - -上面的例子中,通过调用 :code:`fluid.io.save_params` 函数,PaddlePaddle Fluid会对默认 -:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, -筛选出其中所有的模型参数,并将这些模型参数保存到指定的 :code:`param_path` 之中。 - - -保存checkpoint用于将来恢复训练 -============================== - -在训练过程中,我们可能希望在一些节点上将当前的训练状态保存下来, -以便在将来需要的时候恢复训练环境继续进行训练。这一般被称作“checkpoint”。 -想要保存checkpoint,可以使用 :code:`fluid.io.save_checkpiont()` 接口。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - trainer_args = {"epoch_id": 200, - "step_id": 20} # just an example - fluid.io.save_checkpoint(executor=exe, - checkpoint_dir=path, - trainer_id=0, - trainer_args=trainer_args, - main_program=prog, - max_num_checkpoints=3) - -上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对默认 -:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, -根据一系列内置的规则自动筛选出其中所有需要保存的变量,并将他们保存到指定的 :code:`path` 目录下。 - -:code:`fluid.io.save_checkpoint` 的各个参数中, :code:`trainer_id` 在单机情况下设置为0即可; :code:`trainer_args` -为一个Python dict,用于给定当前的epoch_id和step_id; -:code:`max_num_checkpoints` 用于表示的最大checkpoint数量, -如果目录中已经存在的checkpoint数量超过这个值,那最早的checkpoint将被删除。 - -如何载入模型变量 -################ - -与模型变量的保存相对应,我们提供了两套API来分别载入模型的参数和载入模型的checkpoint。 - -载入模型用于对新样本的预测 -========================== - -对于通过 :code:`fluid.io.save_params` 保存的模型,可以使用 :code:`fluid.io.load_params` -来进行载入。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.load_params(executor=exe, dirname=param_path, - main_program=prog) - -上面的例子中,通过调用 :code:`fluid.io.load_params` 函数,PaddlePaddle Fluid会对 -:code:`prog` 中的所有模型变量进行扫描,筛选出其中所有的模型参数, -并尝试从 :code:`param_path` 之中读取加载它们。 - -需要格外注意的是,这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_params` -时所用的 :code:`prog` 中的前向部分完全一致,且不能包含任何参数更新的操作。如果两者存在不一致, -那么可能会导致一些变量未被正确加载;如果错误地包含了参数更新操作,那可能会导致正常预测过程中参数被更改。 -这两个 :code:`fluid.Program` 之间的关系类似于训练 :code:`fluid.Program` -和测试 :code:`fluid.Program` 之间的关系,详见: :ref:`user_guide_test_while_training`。 - -另外,需特别注意运行 :code:`fluid.default_startup_program()` 必须在调用 :code:`fluid.io.load_params` -之前。如果在之后运行,可能会覆盖已加载的模型参数导致错误。 - - -载入checkpoint用于恢复训练 -========================== - -对于通过 :code:`fluid.io.save_checkpoint` 保存的模型,可以使用 :code:`fluid.io.load_checkpoint` -来进行载入。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, - serial=9, main_program=prog) - -上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对 -:code:`prog` 中的所有模型变量进行扫描,根据内置规则自动筛选出需要加载的变量, -并尝试从 :code:`path` 之中加载它们。 - -参数 :code:`serial` 用来标记具体要加载的checkpoint的版本号。在保存checkpoint的时候, -一个checkpoint会被保存在一个子目录中,并在目录名上体现出自己的版本号。 -一般越大的版本号表示这个checkpoint越新。 - -这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_checkpoint` 时所用的 :code:`prog` -完全一致,否则会导致变量加载错误或者未加载。另外,与 :code:`fluid.io.save_params` 类似, -运行 :code:`fluid.default_startup_program()` 也必须在 :code:`fluid.io.load_checkpoint` -之前进行。 - -多机checkpoint保存 -################## - -.. toctree:: - :maxdepth: 2 - - checkpoint_doc_cn.md \ No newline at end of file diff --git a/doc/fluid/new_docs/user_guides/howto/training/single_node.rst b/doc/fluid/new_docs/user_guides/howto/training/single_node.rst deleted file mode 100644 index 23eac0f831f2d6d052b7fc35b536d4ab633df851..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/single_node.rst +++ /dev/null @@ -1,119 +0,0 @@ -######## -单机训练 -######## - -准备工作 -######## - -要进行PaddlePaddle Fluid单机训练,需要先 :ref:`user_guide_prepare_data` 和 -:ref:`user_guide_configure_simple_model` 。当\ -:ref:`user_guide_configure_simple_model` 完毕后,可以得到两个\ -:code:`fluid.Program`, :code:`startup_program` 和 :code:`main_program`。 -默认情况下,可以使用 :code:`fluid.default_startup_program()` 与\ :code:`fluid.default_main_program()` 获得全局的 :code:`fluid.Program`。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - image = fluid.layers.data(name="image", shape=[784]) - label = fluid.layers.data(name="label", shape=[1]) - hidden = fluid.layers.fc(input=image, size=100, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.mean( - fluid.layers.cross_entropy( - input=prediction, - label=label - ) - ) - - sgd = fluid.optimizer.SGD(learning_rate=0.001) - sgd.minimize(loss) - - # Here the fluid.default_startup_program() and fluid.default_main_program() - # has been constructed. - -在上述模型配置执行完毕后, :code:`fluid.default_startup_program()` 与\ -:code:`fluid.default_main_program()` 配置完毕了。 - -初始化参数 -########## - -参数随机初始化 -============== - -用户配置完模型后,参数初始化操作会被写入到\ -:code:`fluid.default_startup_program()` 中。使用 :code:`fluid.Executor()` 运行 -这一程序,即可在全局 :code:`fluid.global_scope()` 中随机初始化参数。例如: - -.. code-block:: python - - exe = fluid.Executor(fluid.CUDAPlace(0)) - exe.run(program=fluid.default_startup_program()) - -值得注意的是: 如果使用多GPU训练,参数需要先在GPU0上初始化,再经由\ -:code:`fluid.ParallelExecutor` 分发到多张显卡上。 - - -载入预定义参数 -============== - -在神经网络训练过程中,经常会需要载入预定义模型,进而继续进行训练。\ -如何载入预定义参数,请参考 :ref:`user_guide_save_load_vars`。 - - -单卡训练 -######## - -执行单卡训练可以使用 :code:`fluid.Executor()` 中的 :code:`run()` 方法,运行训练\ -:code:`fluid.Program` 即可。在运行的时候,用户可以通过 :code:`run(feed=...)`\ -参数传入数据;用户可以通过 :code:`run(fetch=...)` 获取持久的数据。例如:\ - -.. code-block:: python - - ... - loss = fluid.layers.mean(...) - - exe = fluid.Executor(...) - # the result is an numpy array - result = exe.run(feed={"image": ..., "label": ...}, fetch_list=[loss]) - -这里有几点注意事项: - -1. feed的数据格式,请参考文章 :ref:`user_guide_feed_data_to_executor`。 -2. :code:`Executor.run` 的返回值是 :code:`fetch_list=[...]` 的variable值。被fetch\ - 的Variable必须是persistable的。 :code:`fetch_list` 可以传入Variable的列表,\ - 也可以传入Variable的名字列表。:code:`Executor.run` 返回Fetch结果列表。 -3. 如果需要取回的数据包含序列信息,可以设置 - :code:`exe.run(return_numpy=False, ...)` 直接返回 :code:`fluid.LoDTensor` - 。用户可以直接访问 :code:`fluid.LoDTensor` 中的信息。 - -多卡训练 -######## - -执行多卡训练可以使用 :code:`fluid.ParallelExecutor` 运行训练 -:code:`fluid.Program`。例如: - -.. code-block:: python - - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name, - main_program=fluid.default_main_program()) - train_exe.run(fetch_list=[loss.name], feed={...}) - -这里有几点注意事项: - -1. :code:`ParallelExecutor` 的构造函数需要指明要执行的 :code:`fluid.Program` , - 并在执行过程中不能修改。默认值是 :code:`fluid.default_main_program()` 。 -2. :code:`ParallelExecutor` 需要明确指定是否使用 CUDA 显卡进行训练。在显卡训练\ - 模式下会占用全部显卡。用户可以配置 `CUDA_VISIBLE_DEVICES `_ 来修改占用\ - 的显卡。 - -进阶使用 -######## - -.. toctree:: - :maxdepth: 2 - - test_while_training - save_load_variables diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle deleted file mode 100644 index 16f6b8835c4ffb82babca56b62ba44494fd6a947..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.graffle and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png deleted file mode 100644 index 587a1a48affdde6809d7f8bf77e1055db7cd8c14..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_nccl2.png and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle deleted file mode 100644 index 046c4903231e8ca441884674c08b381766c0bbae..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.graffle and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png b/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png deleted file mode 100644 index cd2f92ad1a14ac12efc2c257c8aa3d1ae403b2b1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/training/src/dist_train_pserver.png and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png b/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png deleted file mode 100644 index 6c078b5241559a05219447db67b5d8a35aeefd3f..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/user_guides/howto/training/src/parallelism.png and /dev/null differ diff --git a/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst b/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst deleted file mode 100644 index 37d5c0d78179ccead7a81dffb4ae2f0d835a5949..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/howto/training/test_while_training.rst +++ /dev/null @@ -1,120 +0,0 @@ -.. _user_guide_test_while_training: - -################## -训练过程中评测模型 -################## - -模型的测试评价与训练的 :code:`fluid.Program` 不同。在测试评价中: - -1. 评价测试不进行反向传播,不优化更新参数。 -2. 评价测试执行的操作可以不同。 - - * 例如 BatchNorm 操作,在训练和测试时执行不同的算法。 - - * 评价模型与训练相比可以是完全不同的模型。 - -生成测试 :code:`fluid.Program` -################################# - -通过克隆训练 :code:`fluid.Program` 生成测试 :code:`fluid.Program` -======================================================================= - -:code:`Program.clone()` 方法可以复制出新的 :code:`fluid.Program` 。 通过设置 -:code:`Program.clone(for_test=True)` 复制含有用于测试的操作Program。简单的使用方法如下: - -.. code-block:: python - - import paddle.fluid as fluid - - img = fluid.layers.data(name="image", shape=[784]) - prediction = fluid.layers.fc( - input=fluid.layers.fc(input=img, size=100, act='relu'), - size=10, - act='softmax' - ) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label)) - acc = fluid.layers.accuracy(input=prediction, label=label) - - test_program = fluid.default_main_program().clone(for_test=True) - - adam = fluid.optimizer.Adam(learning_rate=0.001) - adam.minimize(loss) - -在使用 :code:`Optimizer` 之前,将 :code:`fluid.default_main_program()` 复制\ -成一个 :code:`test_program` 。之后使用测试数据运行 :code:`test_program`,\ -就可以做到运行测试程序,而不影响训练结果。 - -分别配置训练 :code:`fluid.Program` 和测试 :code:`fluid.Program` -===================================================================== - -如果训练程序和测试程序相差较大时,用户也可以通过完全定义两个不同的 -:code:`fluid.Program`,分别进行训练和测试。在PaddlePaddle Fluid中,\ -所有的参数都有名字。如果两个不同的操作,甚至两个不同的网络使用了同样名字的参数,\ -那么他们的值和内存空间都是共享的。 - -PaddlePaddle Fluid中使用 :code:`fluid.unique_name` 包来随机初始化用户未定义的\ -参数名称。通过 :code:`fluid.unique_name.guard` 可以确保多次调用某函数\ -参数初始化的名称一致。 - -例如: - -.. code-block:: python - - import paddle.fluid as fluid - - def network(is_test): - file_obj = fluid.layers.open_files(filenames=["test.recordio"] if is_test else ["train.recordio"], ...) - img, label = fluid.layers.read_file(file_obj) - hidden = fluid.layers.fc(input=img, size=100, act="relu") - hidden = fluid.layers.batch_norm(input=hidden, is_test=is_test) - ... - return loss - - with fluid.unique_name.guard(): - train_loss = network(is_test=False) - sgd = fluid.optimizer.SGD(0.001) - sgd.minimize(train_loss) - - test_program = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_gurad(test_program, fluid.Program()): - test_loss = network(is_test=True) - - # fluid.default_main_program() is the train program - # fluid.test_program is the test program - -执行测试 :code:`fluid.Program` -################################# - -使用 :code:`Executor` 执行测试 :code:`fluid.Program` -======================================================= - -用户可以使用 :code:`Executor.run(program=...)` 来执行测试 -:code:`fluid.Program`。 - -例如 - -.. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - test_acc = exe.run(program=test_program, feed=test_data_batch, fetch_list=[acc]) - print 'Test accuracy is ', test_acc - -使用 :code:`ParallelExecutor` 执行测试 :code:`fluid.Program` -=============================================================== - -用户可以使用训练用的 :code:`ParallelExecutor` 与测试 :code:`fluid.Program` -一起新建一个测试的 :code:`ParallelExecutor` ;再使用测试 -:code:`ParallelExecutor.run` 来执行测试。 - -例如: - -.. code-block:: python - - train_exec = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) - - test_exec = fluid.ParallelExecutor(use_cuda=True, share_vars_from=train_exec, - main_program=test_program) - test_acc = test_exec.run(fetch_list=[acc], ...) - diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst deleted file mode 100644 index 377631109d8f65c149b12cd2a0e4da920fdf4def..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -######## -使用指南 -######## - - -.. todo:: - - 完善导引介绍 - -.. toctree:: - :maxdepth: 2 - - howto/prepare_data/index - howto/configure_simple_model/index - howto/training/index - howto/debug/index - howto/evaluation/index - howto/inference/index - models/index.rst diff --git a/doc/fluid/new_docs/user_guides/models/index.rst b/doc/fluid/new_docs/user_guides/models/index.rst deleted file mode 100644 index 998e95c4885dc313d9449f5466f80c53d34fe82a..0000000000000000000000000000000000000000 --- a/doc/fluid/new_docs/user_guides/models/index.rst +++ /dev/null @@ -1,137 +0,0 @@ -Fluid 模型库 -============ - -图像分类 --------- - -图像分类是根据图像的语义信息对不同类别图像进行区分,是计算机视觉中重要的基础问题,是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础,在许多领域都有着广泛的应用。如:安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 - -在深度学习时代,图像分类的准确率大幅度提升,在图像分类任务中,我们向大家介绍了如何在经典的数据集ImageNet上,训练常用的模型,包括AlexNet、VGG、GoogLeNet、ResNet、Inception-v4、MobileNet、DPN(Dual -Path -Network)、SE-ResNeXt模型,也开源了\ `训练的模型 `__\ 方便用户下载使用。同时提供了能够将Caffe模型转换为PaddlePaddle -Fluid模型配置和参数文件的工具。 - -- `AlexNet `__ -- `VGG `__ -- `GoogleNet `__ -- `Residual - Network `__ -- `Inception-v4 `__ -- `MobileNet `__ -- `Dual Path - Network `__ -- `SE-ResNeXt `__ -- `Caffe模型转换为Paddle - Fluid配置和模型文件工具 `__ - -目标检测 --------- - -目标检测任务的目标是给定一张图像或是一个视频帧,让计算机找出其中所有目标的位置,并给出每个目标的具体类别。对于人类来说,目标检测是一个非常简单的任务。然而,计算机能够“看到”的是图像被编码之后的数字,很难解图像或是视频帧中出现了人或是物体这样的高层语义概念,也就更加难以定位目标出现在图像中哪个区域。与此同时,由于目标会出现在图像或是视频帧中的任何位置,目标的形态千变万化,图像或是视频帧的背景千差万别,诸多因素都使得目标检测对计算机来说是一个具有挑战性的问题。 - -在目标检测任务中,我们介绍了如何基于\ `PASCAL -VOC `__\ 、\ `MS -COCO `__\ 数据训练通用物体检测模型,当前介绍了SSD算法,SSD全称Single Shot MultiBox Detector,是目标检测领域较新且效果较好的检测算法之一,具有检测速度快且检测精度高的特点。 - -开放环境中的检测人脸,尤其是小的、模糊的和部分遮挡的人脸也是一个具有挑战的任务。我们也介绍了如何基于 `WIDER FACE `_ 数据训练百度自研的人脸检测PyramidBox模型,该算法于2018年3月份在WIDER FACE的多项评测中均获得 `第一名 `_。 - -- `Single Shot MultiBox - Detector `__ -- `Face Detector: PyramidBox `_ - -图像语义分割 ------------- - -图像语意分割顾名思义是将图像像素按照表达的语义含义的不同进行分组/分割,图像语义是指对图像内容的理解,例如,能够描绘出什么物体在哪里做了什么事情等,分割是指对图片中的每个像素点进行标注,标注属于哪一类别。近年来用在无人车驾驶技术中分割街景来避让行人和车辆、医疗影像分析中辅助诊断等。 - -在图像语义分割任务中,我们介绍如何基于图像级联网络(Image Cascade -Network,ICNet)进行语义分割,相比其他分割算法,ICNet兼顾了准确率和速度。 - -- `ICNet `__ - -场景文字识别 ------------- - -许多场景图像中包含着丰富的文本信息,对理解图像信息有着重要作用,能够极大地帮助人们认知和理解场景图像的内容。场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下,将图像信息转化为文字序列的过程,可认为是一种特别的翻译过程:将图像输入翻译为自然语言输出。场景图像文字识别技术的发展也促进了一些新型应用的产生,如通过自动识别路牌中的文字帮助街景应用获取更加准确的地址信息等。 - -在场景文字识别任务中,我们介绍如何将基于CNN的图像特征提取和基于RNN的序列翻译技术结合,免除人工定义特征,避免字符分割,使用自动学习到的图像特征,完成端到端地无约束字符定位和识别。当前,介绍了CRNN-CTC模型,后续会引入基于注意力机制的序列到序列模型。 - -- `CRNN-CTC模型 `__ - -语音识别 --------- - -自动语音识别(Automatic Speech Recognition, -ASR)是将人类声音中的词汇内容转录成计算机可输入的文字的技术。语音识别的相关研究经历了漫长的探索过程,在HMM/GMM模型之后其发展一直较为缓慢,随着深度学习的兴起,其迎来了春天。在多种语言识别任务中,将深度神经网络(DNN)作为声学模型,取得了比GMM更好的性能,使得 -ASR -成为深度学习应用最为成功的领域之一。而由于识别准确率的不断提高,有越来越多的语言技术产品得以落地,例如语言输入法、以智能音箱为代表的智能家居设备等 -—— 基于语言的交互方式正在深刻的改变人类的生活。 - -与 `DeepSpeech `__ -中深度学习模型端到端直接预测字词的分布不同,本实例更接近传统的语言识别流程,以音素为建模单元,关注语言识别中声学模型的训练,利用\ `kaldi `__\ 进行音频数据的特征提取和标签对齐,并集成 -kaldi 的解码器完成解码。 - -- `DeepASR `__ - -机器翻译 --------- - -机器翻译(Machine -Translation)将一种自然语言(源语言)转换成一种自然语言(目标语音),是自然语言处理中非常基础和重要的研究方向。在全球化的浪潮中,机器翻译在促进跨语言文明的交流中所起的重要作用是不言而喻的。其发展经历了统计机器翻译和基于神经网络的神经机器翻译(Nueural -Machine Translation, NMT)等阶段。在 NMT -成熟后,机器翻译才真正得以大规模应用。而早阶段的 NMT -主要是基于循环神经网络 RNN -的,其训练过程中当前时间步依赖于前一个时间步的计算,时间步之间难以并行化以提高训练速度。因此,非 -RNN 结构的 NMT 得以应运而生,例如基于卷积神经网络 CNN -的结构和基于自注意力机制(Self-Attention)的结构。 - -本实例所实现的 Transformer -就是一个基于自注意力机制的机器翻译模型,其中不再有RNN或CNN结构,而是完全利用 -Attention 学习语言中的上下文依赖。相较于RNN/CNN, -这种结构在单层内计算复杂度更低、易于并行化、对长程依赖更易建模,最终在多种语言之间取得了最好的翻译效果。 - -- `Transformer `__ - -强化学习 --------- - -强化学习是近年来一个愈发重要的机器学习方向,特别是与深度学习相结合而形成的深度强化学习(Deep -Reinforcement Learning, -DRL),取得了很多令人惊异的成就。人们所熟知的战胜人类顶级围棋职业选手的 -AlphaGo 就是 DRL -应用的一个典型例子,除游戏领域外,其它的应用还包括机器人、自然语言处理等。 - -深度强化学习的开山之作是在Atari视频游戏中的成功应用, -其可直接接受视频帧这种高维输入并根据图像内容端到端地预测下一步的动作,所用到的模型被称为深度Q网络(Deep -Q-Network, DQN)。本实例就是利用PaddlePaddle Fluid这个灵活的框架,实现了 -DQN 及其变体,并测试了它们在 Atari 游戏中的表现。 - -- `DeepQNetwork `__ - -中文词法分析 ------------- - -中文分词(Word Segmentation)是将连续的自然语言文本,切分出具有语义合理性和完整性的词汇序列的过程。因为在汉语中,词是承担语义的最基本单位,切词是文本分类、情感分析、信息检索等众多自然语言处理任务的基础。 词性标注(Part-of-speech Tagging)是为自然语言文本中的每一个词汇赋予一个词性的过程,这里的词性包括名词、动词、形容词、副词等等。 命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别自然语言文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等。 我们将这三个任务统一成一个联合任务,称为词法分析任务,基于深度神经网络,利用海量标注语料进行训练,提供了一个端到端的解决方案。 - -我们把这个联合的中文词法分析解决方案命名为LAC。LAC既可以认为是Lexical Analysis of Chinese的首字母缩写,也可以认为是LAC Analyzes Chinese的递归缩写。 - -- `LAC `__ - -情感倾向分析 ------------- - -情感倾向分析针对带有主观描述的中文文本,可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极、 中性。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控,为企业提供有力的决策支持。本次我们开放 AI开放平台中情感倾向分析采用的模型(http://ai.baidu.com/tech/nlp/sentiment_classify ), 提供给用户使用。 - -- `Senta `__ - -AnyQ ----- - -`AnyQ `__\ (ANswer Your Questions) -开源项目主要包含面向FAQ集合的问答系统框架、文本语义匹配工具SimNet。 -问答系统框架采用了配置化、插件化的设计,各功能均通过插件形式加入,当前共开放了20+种插件。开发者可以使用AnyQ系统快速构建和定制适用于特定业务场景的FAQ问答系统,并加速迭代和升级。 - -SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架,该框架在百度各产品上广泛应用,主要包括BOW、CNN、RNN、MM-DNN等核心网络结构形式,同时基于该框架也集成了学术界主流的语义匹配模型,如MatchPyramid、MV-LSTM、K-NRM等模型。使用SimNet构建出的模型可以便捷的加入AnyQ系统中,增强AnyQ系统的语义匹配能力。 - -- `SimNet in PaddlePaddle - Fluid `__ diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md deleted file mode 100644 index bb6d4563f5617fb98af055bca2f6f0479bdb4393..0000000000000000000000000000000000000000 --- a/doc/fluid/read_source.md +++ /dev/null @@ -1,67 +0,0 @@ -# PaddlePaddle Fluid Source Code Overview - -Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book - -Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework - -Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators - -Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory - -Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform - -# Compile Time - -The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto). - -```python -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -y_predict = fluid.layers.fc(input=x, size=1, act=None) -cost = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_cost = fluid.layers.mean(x=cost) - -sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) -sgd_optimizer.minimize(avg_cost) -``` - -- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) -- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) - - Every Layer has one or more operators and variables/parameters - - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files: - - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h) - - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h) - - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h) -- Optimizer: `fluid.optimizer.SGD`. It does the following - - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] - - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] - -# Run Time - -The following **evaluates** the NN. Instantiates all the variables, operators. - -```python -place = fluid.CPUPlace() -feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) -exe = fluid.Executor(place) - -# Allocate memory. Initialize Parameter. -exe.run(fluid.default_startup_program()) - -# Allocate memory. Do computation. -exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost]) -``` - -- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h) - - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h) -- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)] - - Feeds the data: `feed=feeder.feed(data)` - - Evaluates all the operators - - Fetches the result: `fetch_list=[avg_cost]` -- Other worth looking files: - - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live - - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live - - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e362d3486487dd0b55e3e40d1c1358f2e5604ac5..f61d1254fd1419490c483532ff15257e5d8f4507 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,17 +1,10 @@ paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Program.rollback ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -61,8 +54,6 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) -paddle.fluid.InferenceTranspiler.__init__ -paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ @@ -86,6 +77,7 @@ paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) +paddle.fluid.initializer.TruncatedNormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)) paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) @@ -107,7 +99,7 @@ paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_ paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) @@ -116,6 +108,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -130,7 +123,7 @@ paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) -paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) +paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) @@ -161,6 +154,7 @@ paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', ' paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -174,8 +168,9 @@ paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=Non paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) -paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None) @@ -235,12 +230,6 @@ paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=Non paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ParallelDo.read_input ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.ParallelDo.write_output ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -348,8 +337,6 @@ paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=[ paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) -paddle.fluid.transpiler.InferenceTranspiler.__init__ -paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d998109df21f585bc4905e00e59fe07247fd3f5e..6d8cbe5d9e491555a94e9420995149041213ab79 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -148,13 +148,13 @@ if(WITH_DISTRIBUTE) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) endif() - + if (NOT WIN32) -cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor - graph graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fast_threaded_ssa_graph_executor) + cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass) endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 6bcfc6cd55f02f0d4f0f6e3170e7cc19ce666a28..fee6ba40047053ed5662fe044eceb0c687bd4db9 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -25,6 +25,10 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, in.place().which(), dst_place.which(), "Currently, model parallelism is only supported between CPU and CUDA"); + // NOTE(yy): TransDataDevice should wait for computation of input. + platform::DeviceContextPool::Instance().Get(in.place())->Wait(); + platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); + // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and // the enforced checkings have been done in GetDeviceContext, so the // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 1413f7bd9ac515ae7dceee62de8f3bc74e3a2efc..ab7412a19fbd13fa39dbae9af528d158cc9ddbd0 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -96,8 +96,8 @@ struct TestBroadcastOpHandle { } param_scopes_[input_scope_idx]->Var("input"); - std::unique_ptr n( - new ir::Node("node0", ir::Node::Type::kOperation)); + std::unique_ptr n = + ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, @@ -115,8 +115,8 @@ struct TestBroadcastOpHandle { #endif } - std::unique_ptr v( - new ir::Node("node1", ir::Node::Type::kVariable)); + std::unique_ptr v = + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", gpu_list_[input_scope_idx]); vars_.emplace_back(in_var_handle); @@ -124,8 +124,8 @@ struct TestBroadcastOpHandle { // add dummy var - std::unique_ptr v2( - new ir::Node("node2", ir::Node::Type::kVariable)); + std::unique_ptr v2 = + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); vars_.emplace_back(new DummyVarHandle(v2.get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); @@ -136,8 +136,8 @@ struct TestBroadcastOpHandle { if (!use_gpu_) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); } - std::unique_ptr v3( - new ir::Node("node3", ir::Node::Type::kVariable)); + std::unique_ptr v3 = + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); VarHandle* out_var_handle = new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]); vars_.emplace_back(out_var_handle); @@ -145,8 +145,8 @@ struct TestBroadcastOpHandle { } // add dummy var - std::unique_ptr v4( - new ir::Node("node4", ir::Node::Type::kVariable)); + std::unique_ptr v4 = + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); vars_.emplace_back(new DummyVarHandle(v4.get())); DummyVarHandle* out_dummy_var_handle = static_cast(vars_.back().get()); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8714a42162bda3d5ad12e7925fe8cc4e693f51b1..77cafa49f18158ea4187908856e10adaa0f916c9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -54,6 +54,8 @@ struct BuildStrategy { std::string debug_graphviz_path_{""}; + bool fuse_elewise_add_act_ops_{false}; + bool enable_data_balance_{false}; }; diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h index 21f75957be5f33f3dfc09c41fa9a1e1ca590f99e..4fb015b0ffe27e6fa91d4eaf373fca4feca66361 100644 --- a/paddle/fluid/framework/details/cow_ptr.h +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -20,79 +20,41 @@ namespace paddle { namespace framework { namespace details { -// Change it to thread safe flags if needed. -class ThreadUnsafeOwnershipFlags { +template +class COWPtr { public: - explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} - - ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags& operator=( - const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; + typedef std::shared_ptr RefPtr; - void SetOwnership(bool flag) { flag_ = flag; } + private: + RefPtr m_sp; - // Invoke the callback if it is not owned. - template - void AcquireOwnershipOnce(Callback acquire) { - if (!flag_) { - acquire(); - flag_ = true; + void detach() { + T* tmp = m_sp.get(); + if (!(tmp == nullptr || m_sp.unique())) { + m_sp = RefPtr(new T(*tmp)); } } - private: - bool flag_; -}; - -// Copy-On-Write pointer. -// It will hold a T* pointer, and only copy once when `MutableData` is invoked. -// -// The template parameter OwnershipFlags should have: -// * a constructor takes a bool. True if own. -// * SetOwnership(bool flag). -// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not -// owned. -// -// https://en.wikipedia.org/wiki/Copy-on-write -template -class COWPtr { public: - // Ctor from raw pointer. - explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + COWPtr() : m_sp(nullptr) {} + explicit COWPtr(T* t) : m_sp(t) {} + explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {} - // Move methods. Steal ownership from origin - COWPtr(COWPtr&& other) - : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} - COWPtr& operator=(COWPtr&& origin) = default; + const T& Data() const { return operator*(); } - // Copy methods. Not own payload - COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} - COWPtr& operator=(const COWPtr& other) { - payload_ = other.payload_; - ownership_.SetOwnership(false); - return *this; - } - - // Access read only data. - const T& Data() const { return *payload_; } + T* MutableData() { return operator->(); } - // Access mutable data. If the data is not owned, the data will be copied - // before. - T* MutableData() { - ownership_.AcquireOwnershipOnce( - [this] { payload_.reset(new T(*payload_)); }); - return payload_.get(); + const T& operator*() const { return *m_sp; } + T& operator*() { + detach(); + return *m_sp; + } + const T* operator->() const { return m_sp.operator->(); } + T* operator->() { + detach(); + return m_sp.operator->(); } - - private: - // Actual data pointer. - std::shared_ptr payload_; - - // Ownership flag. - OwnershipFlags ownership_; }; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index d2142af277c0b356d83941b3baab1947cce31dac..5b055d7cb4d127dc20f2cf70869134f24a93d429 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -30,6 +30,14 @@ TEST(COWPtr, all) { ASSERT_EQ(ptr2.Data(), 10); } +TEST(COWPtr, change_old) { + COWPtr ptr(new int{0}); + COWPtr ptr2 = ptr; + *ptr.MutableData() = 10; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(ptr.Data(), 10); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 7606f2bc06b2ecf07c5649eeae1a2d5587a8880c..6e22fedf1c39428528c00cce4c9a4460dfb95cb3 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -54,7 +54,6 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( paddle::framework::FeedFetchList fetches; fetches.resize(fetch_tensors.size()); std::unordered_map> fetched_vars; - std::vector> fetch_nodes; std::vector> fetch_ops; for (auto &fetch_var_name : fetch_tensors) { @@ -75,9 +74,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( auto &vars = fetched_var_it->second; - fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); - auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i, - &local_scopes_); + ir::Node *fetch_node = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); + auto *op = new FetchOpHandle(fetch_node, &fetches, i, &local_scopes_); fetch_ops.emplace_back(op); for (auto &p : places_) { @@ -116,9 +115,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( num_complete += num_comp; } // Wait FetchOps. - if (!fetch_ops.empty()) { - fetch_ops.clear(); - } + ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } void FastThreadedSSAGraphExecutor::RunOpAsync( diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index c9b94d1e1039df6ff27f9ffe225b2a50c35a5c50..ed67e88ff6a7fe9efd93e5dfd4d7bdf4c43aac2e 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -82,13 +82,15 @@ struct TestGatherOpHandle { } param_scopes_[input_scope_idx]->Var("out"); - nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation)); + nodes.emplace_back( + ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release()); op_handle_.reset( new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable)); + nodes.emplace_back( + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release()); auto* in_var_handle = new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); vars_.emplace_back(in_var_handle); @@ -96,7 +98,8 @@ struct TestGatherOpHandle { } // add dummy var - nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable)); + nodes.emplace_back( + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release()); vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* in_dummy_var_handle = static_cast(vars_.back().get()); @@ -104,14 +107,16 @@ struct TestGatherOpHandle { op_handle_->AddInput(in_dummy_var_handle); // add output - nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable)); + nodes.emplace_back( + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release()); auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx, "out", gpu_list_[input_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); // add dummy var - nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable)); + nodes.emplace_back( + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release()); vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 250e093a5f789dba6b06df4889c060c294d469fe..8f319116ab80b75c624f35b0e1315e7362e88d9a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -127,6 +127,9 @@ static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; void MultiDevSSAGraphBuilder::Init() const { + all_vars_.clear(); + balance_vars_.clear(); + loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 1ca8c4b855f9468589e537245380451a91a50b14..47aaa80f4d66a48b729d0638badcab885a50585c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; - private: - mutable std::string loss_var_name_; - mutable std::vector places_; - mutable std::vector local_scopes_; - mutable std::unordered_set grad_names_; - #ifdef PADDLE_WITH_CUDA mutable platform::NCCLContextMap *nccl_ctxs_; #endif @@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t GetAppropriateDeviceID( const std::vector &var_names) const; - private: + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; + + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; mutable std::vector balance_vars_; - - void SetCommunicationContext(OpHandleBase *op_handle, - const platform::Place &p) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 09b97bd0d98dc4ad1124dcbc495cff921bf03efc..780da5478ff34ecd7096d0ef62b72bf1088dd221 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -19,6 +19,19 @@ namespace framework { namespace details { SSAGraphExecutor::~SSAGraphExecutor() {} +void ClearFetchOp(ir::Graph* graph, + std::vector>* fetch_ops) { + if (fetch_ops->empty()) return; + + for (auto& op : *fetch_ops) { + for (auto& out_var : op->Node()->outputs) { + graph->RemoveNode(out_var); + } + graph->RemoveNode(op->Node()); + } + fetch_ops->clear(); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 96fffb7d9430cd00b3823ada9fbe9a65a6bd718c..d5cf7737d565c523995e6685b73c57e5a6f0197b 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -18,6 +18,7 @@ #include #include +#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/ir/graph.h" @@ -36,6 +37,9 @@ class SSAGraphExecutor { virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; + +void ClearFetchOp(ir::Graph* graph, + std::vector>* fetch_ops); } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c9e331ef359f853263f8dad38dd0a2be4d9618ad..31beef3ae829d72570ee7c879dac71ed600cd216 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -69,12 +69,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Step 2. Insert FetchOps std::vector> fetch_ops; - std::vector> tmp_nodes; std::unordered_set> fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); - InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies, - &pending_ops, &pending_vars, &ready_vars, &fetch_data); + InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, + &pending_vars, &ready_vars, &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { @@ -136,9 +135,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( PADDLE_ENFORCE(ready_ops.empty()); // Wait FetchOps. - if (!fetch_ops.empty()) { - fetch_ops.clear(); - } + ClearFetchOp(graph_.get(), &fetch_ops); return fetch_data; } @@ -146,7 +143,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, - std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, @@ -171,9 +167,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( auto &vars = fetched_var_it->second; - temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); - auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i, - &local_scopes_); + ir::Node *fetch_node = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); + auto *op = new FetchOpHandle(fetch_node, fetch_data, i, &local_scopes_); fetch_ops->emplace_back(op); for (auto &p : places_) { @@ -184,8 +180,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( op->AddInput(var); } - temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); - auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get()); + ir::Node *fetch_var = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kVariable); + auto *fetch_dummy = new DummyVarHandle(fetch_var); op->AddOutput(fetch_dummy); fetch_dependencies->emplace(fetch_dummy); this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 9135c1f5d435d5e2c60eb90c80803361aa31a3c4..512f8a4ca5a9b82a395dde11722b8db44ea5ec27 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -73,7 +73,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, - std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index b4d3fa25c35fbf25b3d2fdd9fa1045dda0f773ec..9bccb1a32bf63b30351ef4428594691b0eef0b6a 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -129,6 +129,9 @@ class GradOpDescMakerBase { std::string ForwardOpType() const { return this->fwd_op_.Type(); } + protected: + const OpDesc& ForwardOp() const { return fwd_op_; } + private: const OpDesc& fwd_op_; const std::unordered_set& no_grad_set_; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 7004f484a9975124750fad4cb8f773342082b514..4dca3ceb4569fb708c7a98621c5239acbe217586 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -37,6 +37,8 @@ pass_library(fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) +cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) + set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..648acc4a759417240d9a39749b059289182ebb1e --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -0,0 +1,374 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FuseElewiseAddActPass::ApplyImpl( + std::unique_ptr graph) const { + std::unordered_set act_types = {"relu", "scale"}; + graph = FuseActElewiseAdd(std::move(graph), act_types); + graph = FuseElewiseAddAct(std::move(graph), act_types); + // backward + { + std::unordered_set in_place_act_types = {"relu_grad"}; + graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types); + } + + // Remove the removable intermediate_out. + RemoveIntermediateOut(graph.get()); + + return graph; +} + +// ele_add(x, act(y)) +std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( + std::unique_ptr graph, + const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("elewise_add_act", graph.get()); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("elewise_add_act/x") + ->AsInput() + ->assert_is_op_input("elementwise_add", "X"); + patterns::ElewiseAddAct elewise_add_act_pattern(gpd.mutable_pattern(), + "elementwise_add"); + + elewise_add_act_pattern(x, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddAct fuse"; + GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, + elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, elewise_add_act_pattern); + + std::string ele_x_n = subgraph.at(x)->Name(); + std::string ele_y_n = ele_y->Name(); + std::string ele_out_n = ele_out->Name(); + std::string act_out_n = act_out->Name(); + + Node *elewise_add_act_node = CreateFuseElewiseAddActNode( + g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n); + + VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " + << ele_add->Name() << " -> " << ele_out_n << "\n" + << "\t " << ele_out_n << " -> " << act->Name() << " -> " + << act_out_n; + + ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node); + found_elewise_add_act_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +// act(ele_add(x,y)) +std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( + std::unique_ptr graph, + const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("act_elewise_add", graph.get()); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("act_elewise_add/x") + ->AsInput() + ->assert_is_ops_input(act_types, "X"); + patterns::ActElewiseAdd act_elewise_add_pattern(gpd.mutable_pattern(), + "act_elewise_add"); + + act_elewise_add_pattern(x, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddAct fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, + act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, act_elewise_add_pattern); + + std::string act_i_n = subgraph.at(x)->Name(); + std::string act_o_n = act_out->Name(); + std::string elewise_add_x_n = ele_x->Name(); + std::string elewise_add_out_n = ele_out->Name(); + + Node *elewise_add_act_node = CreateFuseElewiseAddActNode( + g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n); + + VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n + << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " + << ele_add->Name() << " -> " << elewise_add_out_n; + + ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node); + found_elewise_add_act_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +// the backward of act(ele_add(x,y)) +// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] +// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] +std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( + std::unique_ptr graph, + const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("elewise_add_act_grad", graph.get()); + + GraphPatternDetector gpd; + auto *d_act_out = gpd.mutable_pattern() + ->NewNode("elewise_add_act_grad_inplace/x") + ->AsInput() + ->assert_is_ops_input(act_types, GradVarName("Out")); + patterns::ElewiseAddActInplaceGrad elewise_add_act_grad_pattern( + gpd.mutable_pattern(), "elewise_add_act_grad_inplace"); + elewise_add_act_grad_pattern(d_act_out, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddActGrad1 fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out, + elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad, ele_add_grad, + elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_ele_x, d_ele_x, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_ele_y, d_ele_y, elewise_add_act_grad_pattern); + + std::string d_act_out_n = subgraph.at(d_act_out)->Name(); + std::string act_out_n = act_out->Name(); + std::string d_itermediate_out_n = d_itermediate_out->Name(); + std::string ele_y_n = ele_y->Name(); + std::string d_ele_x_n = d_ele_x->Name(); + std::string d_ele_y_n = d_ele_y->Name(); + + OpDesc desc; + desc.SetType("fused_elemwise_activation_grad"); + desc.SetInput("IntermediateOut", {}); + desc.SetInput("X", {}); + desc.SetInput("Y", std::vector({ele_y_n})); + desc.SetInput("Out", std::vector({act_out_n})); + desc.SetInput(GradVarName("Out"), std::vector({d_act_out_n})); + desc.SetOutput(GradVarName("X"), std::vector({d_ele_x_n})); + desc.SetOutput(GradVarName("Y"), std::vector({d_ele_y_n})); + desc.SetOutput(GradVarName("IntermediateOut"), + std::vector({d_itermediate_out_n})); + + desc.SetAttr("save_intermediate_out", false); + desc.SetAttr("functor_list", + std::vector( + {act_grad->Op()->Type(), ele_add_grad->Op()->Type()})); + + for (auto &n : {act_grad->Op(), ele_add_grad->Op()}) { + for (auto &m_ele : n->GetAttrMap()) { + desc.SetAttr(m_ele.first, m_ele.second); + } + } + + auto fused_node = g->CreateOpNode(&desc); + + VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " + << d_itermediate_out_n << " and " << act_out_n << " -> " + << ele_add_grad->Name() << " -> " << d_itermediate_out_n; + + ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node); + found_elewise_add_act_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode( + Graph *g, const Node *op_1, const Node *op_2, const std::string &ele_x_n, + const std::string &ele_y_n, const std::string &ele_out_n, + const std::string &act_out_n) const { + OpDesc desc; + desc.SetInput("X", std::vector({ele_x_n})); + desc.SetInput("Y", std::vector({ele_y_n})); + desc.SetOutput("Out", std::vector({act_out_n})); + desc.SetOutput("IntermediateOut", std::vector({ele_out_n})); + desc.SetType("fused_elemwise_activation"); + desc.SetAttr("save_intermediate_out", true); + desc.SetAttr("functor_list", std::vector( + {op_1->Op()->Type(), op_2->Op()->Type()})); + + // Set attrs + for (auto &n : {op_1->Op(), op_2->Op()}) { + for (auto &m_ele : n->GetAttrMap()) { + desc.SetAttr(m_ele.first, m_ele.second); + } + } + + auto elewise_add_act_node = g->CreateOpNode(&desc); + return elewise_add_act_node; +} + +void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { + std::unordered_set need_removed_nodes; + for (auto &cur_node : graph->Nodes()) { + if (cur_node->IsVar()) continue; + if (cur_node->Name() == "fused_elemwise_activation") { + bool save_intermediate_out = + boost::get(cur_node->Op()->GetAttr("save_intermediate_out")); + auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut"); + PADDLE_ENFORCE( + save_intermediate_out && !intermediate_out_args.empty(), + "The %s should save the intermediate_out in the fusing stage.", + cur_node->Name()); + + // If the intermediate_out's output is empty, it should be removed. + auto cur_node_outputs = cur_node->outputs; + for (auto &out : cur_node_outputs) { + if (out->Name() == intermediate_out_args[0]) { + if (out->outputs.size() == 0) { + cur_node->outputs = this->RemoveNode(out, cur_node->outputs); + need_removed_nodes.insert(std::move(out)); + cur_node->Op()->SetAttr("save_intermediate_out", false); + } + } + } + } else if (cur_node->Name() == "fused_elemwise_activation_grad") { + auto intermediate_out_grad_args = + cur_node->Op()->Output(GradVarName("IntermediateOut")); + PADDLE_ENFORCE( + !intermediate_out_grad_args.empty(), + "The %s should save the intermediate_out in the fusing stage.", + cur_node->Name()); + auto cur_node_outputs = cur_node->outputs; + // If the intermediate_out_g's output is empty, it should be removed. + for (auto &out : cur_node_outputs) { + if (out->Name() == intermediate_out_grad_args[0] && + out->outputs.empty()) { + cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {}); + cur_node->outputs = this->RemoveNode(out, cur_node->outputs); + need_removed_nodes.insert(std::move(out)); + } + } + } + } + GraphSafeRemoveNodes(graph, need_removed_nodes); +} + +void FuseElewiseAddActPass::ReLinkNodes(Graph *graph, + const Node *intermediate_out, + Node *op_1, Node *op_2, + Node *fused_op) const { // delete act + for (auto &in : op_1->inputs) { + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs); + } + + std::unordered_set nodes2delete; + for (auto &out : op_1->outputs) { + if (out->IsCtrlVar()) { + auto result_iter = std::find_if( + op_2->inputs.begin(), op_2->inputs.end(), + [&out](const Node *node) -> bool { return node == out; }); + + if (result_iter == op_2->inputs.end()) { + IR_OP_VAR_LINK(fused_op, out); + } else { + nodes2delete.emplace(out); + } + } else { + PADDLE_ENFORCE(out == intermediate_out); + IR_OP_VAR_LINK(fused_op, out); + } + } + + for (auto &in : op_2->inputs) { + if (in == intermediate_out || nodes2delete.count(in)) { + continue; + } + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs); + } + + for (auto &out : op_2->outputs) { + IR_OP_VAR_LINK(fused_op, out); + } + + nodes2delete.insert(std::move(op_1)); + nodes2delete.insert(std::move(op_2)); + + GraphSafeRemoveNodes(graph, nodes2delete); +} + +std::vector FuseElewiseAddActPass::ReplaceNode( + Node *cur_node, Node *new_node, const std::vector &nodes) const { + std::vector new_list(nodes.size()); + bool has_replaced = false; + std::transform(nodes.begin(), nodes.end(), new_list.begin(), + [&](Node *node) -> Node * { + if (node == cur_node) { + has_replaced = true; + return new_node; + } + return node; + }); + PADDLE_ENFORCE(has_replaced, "Not find %s in the node list.", + cur_node->Name()); + return new_list; +} + +std::vector FuseElewiseAddActPass::RemoveNode( + Node *trg_node, const std::vector &nodes) const { + std::vector new_list(nodes.size()); + auto end_iter = + std::copy_if(nodes.begin(), nodes.end(), new_list.begin(), + [&](Node *node) -> bool { return node != trg_node; }); + new_list.resize( + static_cast(std::distance(new_list.begin(), end_iter))); + return new_list; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_elewise_add_act_pass, + paddle::framework::ir::FuseElewiseAddActPass); diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b2fecc076efca333539fe81e67eee222873aee2a --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class FuseElewiseAddActPass : public FusePassBase { + public: + virtual ~FuseElewiseAddActPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + std::unique_ptr FuseElewiseAddAct( + std::unique_ptr graph, + const std::unordered_set &act_types) const; + + std::unique_ptr FuseActElewiseAdd( + std::unique_ptr graph, + const std::unordered_set &act_types) const; + + std::unique_ptr FuseElewiseAddActInplaceGrad( + std::unique_ptr graph, + const std::unordered_set &act_types) const; + + /** + * Remove the removable intermediate_out. + * - If the intermediate_out is only used by the backward op, but the + * backward op doesn't use intermediate_out. + * - If the intermediate_out_grad is not used by any op. + */ + void RemoveIntermediateOut(Graph *graph) const; + + std::vector ReplaceNode(Node *cur_node, Node *new_node, + const std::vector &nodes) const; + + std::vector RemoveNode(Node *trg_node, + const std::vector &nodes) const; + + void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1, + Node *op_2, Node *fused_op) const; + Node *CreateFuseElewiseAddActNode(Graph *g, const Node *op_1, + const Node *op_2, + const std::string &ele_x_n, + const std::string &ele_y_n, + const std::string &ele_out_n, + const std::string &act_out_n) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 11d5998aafe1f325b94ef1a5ea1c13c72c13f5c9..ef5113819696238d4e06b826bf43064b0f368dea 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -20,10 +20,10 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" - namespace paddle { namespace framework { namespace ir { @@ -34,7 +34,7 @@ using string::Style; size_t PDPattern::id_ = 0UL; -PDNode* PDPattern::NewNode(const std::string& name) { +PDNode *PDPattern::NewNode(const std::string &name) { if (!name.empty()) { PADDLE_ENFORCE_EQ(node_map_.count(name), 0, "PDNode's name should be unique, get duplicate [%s]", @@ -42,12 +42,12 @@ PDNode* PDPattern::NewNode(const std::string& name) { } nodes_.emplace_back(new PDNode(this, name)); - auto* cur = nodes_.back().get(); + auto *cur = nodes_.back().get(); node_map_[name] = cur; return cur; } -PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { +PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) { if (!name.empty()) { PADDLE_ENFORCE_EQ(node_map_.count(name), 0, "PDNode's name should be unique, get duplicate [%s]", @@ -55,12 +55,12 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { } nodes_.emplace_back(new PDNode(std::move(teller), this, name)); - auto* cur = nodes_.back().get(); + auto *cur = nodes_.back().get(); node_map_[name] = cur; return cur; } -PDNode* PDPattern::RetrieveNode(const std::string& id) const { +PDNode *PDPattern::RetrieveNode(const std::string &id) const { auto it = node_map_.find(id); if (it == node_map_.end()) { return nullptr; @@ -69,14 +69,14 @@ PDNode* PDPattern::RetrieveNode(const std::string& id) const { return it->second; } -void PDPattern::AddEdge(PDNode* a, PDNode* b) { +void PDPattern::AddEdge(PDNode *a, PDNode *b) { PADDLE_ENFORCE(a); PADDLE_ENFORCE(b); PADDLE_ENFORCE(a != b, "can't connect to the same nodes."); edges_.emplace_back(a, b); } -void GraphPatternDetector::operator()(Graph* graph, +void GraphPatternDetector::operator()(Graph *graph, GraphPatternDetector::handle_t handler) { if (!MarkPDNodesInGraph(*graph)) { return; @@ -90,18 +90,18 @@ void GraphPatternDetector::operator()(Graph* graph, if (subgraphs.empty()) return; PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; - for (auto& g : subgraphs) { + for (auto &g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } -bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { +bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { VLOG(3) << "mark pdnodes in graph"; if (graph.Nodes().empty()) return false; - for (auto& node : GraphTraits::DFS(graph)) { - for (const auto& pdnode : pattern_.nodes()) { + for (auto &node : GraphTraits::DFS(graph)) { + for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { VLOG(4) << "pdnode " << pdnode->name() << " marked"; pdnodes2nodes_[pdnode.get()].insert(&node); @@ -109,15 +109,15 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { } } // Check to early stop if some PDNode can't find matched Node. - for (auto& pdnode : pattern_.nodes()) { + for (auto &pdnode : pattern_.nodes()) { if (!pdnodes2nodes_.count(pdnode.get())) { VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; // return false; } } - for (auto& item : pdnodes2nodes_) { - for (auto& n : item.second) { - GetMarkedNodes(const_cast(&graph)).insert(n); + for (auto &item : pdnodes2nodes_) { + for (auto &n : item.second) { + GetMarkedNodes(const_cast(&graph)).insert(n); } } VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; @@ -128,28 +128,28 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { // The intermediate Nodes can only link to the nodes inside the pattern, or this // subgraph will be droped. void GraphPatternDetector::ValidateByNodeRole( - std::vector* subgraphs) { + std::vector *subgraphs) { std::vector result; subgraphs->erase( std::remove_if( subgraphs->begin(), subgraphs->end(), - [](const GraphPatternDetector::subgraph_t& subgraph) -> bool { + [](const GraphPatternDetector::subgraph_t &subgraph) -> bool { // Collect the inputs and outputs. - std::unordered_set ios; - for (auto& item : subgraph) { + std::unordered_set ios; + for (auto &item : subgraph) { if (!item.first->IsIntermediate()) { ios.insert(item.second); } } - for (auto& item : subgraph) { + for (auto &item : subgraph) { if (item.first->IsIntermediate()) { - for (auto* x : item.second->inputs) { + for (auto *x : item.second->inputs) { if (!ios.count(x)) { return true; } } - for (auto* x : item.second->outputs) { + for (auto *x : item.second->outputs) { if (!ios.count(x)) { return true; } @@ -162,9 +162,9 @@ void GraphPatternDetector::ValidateByNodeRole( } struct HitGroup { - std::unordered_map roles; + std::unordered_map roles; - bool Match(Node* node, PDNode* pat) { + bool Match(Node *node, PDNode *pat) { if (nodes_.count(node)) { if (!roles.count(pat)) return false; return roles[pat] == node; @@ -172,18 +172,18 @@ struct HitGroup { return !roles.count(pat) || roles.at(pat) == node; } - void Register(Node* node, PDNode* pat) { + void Register(Node *node, PDNode *pat) { roles[pat] = node; nodes_.insert(node); } private: - std::unordered_set nodes_; + std::unordered_set nodes_; }; // Tell whether Node a links to b. -bool IsNodesLink(Node* a, Node* b) { - for (auto* node : a->outputs) { +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outputs) { if (b == node) { return true; } @@ -198,10 +198,10 @@ GraphPatternDetector::DetectPatterns() { std::vector init_groups; std::array, 2> bi_records; // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); - auto* first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() : pattern_.edges().front().first; if (!pdnodes2nodes_.count(first_pnode)) return result; - for (auto* node : pdnodes2nodes_[first_pnode]) { + for (auto *node : pdnodes2nodes_[first_pnode]) { HitGroup group; group.roles[first_pnode] = node; init_groups.emplace_back(group); @@ -212,21 +212,21 @@ GraphPatternDetector::DetectPatterns() { // Extend a PDNode to subgraphs by deducing the connection relations defined // in edges of PDNodes. - for (const auto& edge : pattern_.edges()) { + for (const auto &edge : pattern_.edges()) { VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); // TODO(Superjomn) Fix bug here, the groups might be duplicate here. // Each role has two PDNodes, which indicates two roles. // Detect two Nodes that can match these two roles and they are connected. - auto& pre_groups = bi_records[step % 2]; - auto& cur_groups = bi_records[1 - (step++ % 2)]; + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; cur_groups.clear(); if (pre_groups.empty()) break; // source -> target - for (Node* source : pdnodes2nodes_[edge.first]) { - for (Node* target : pdnodes2nodes_[edge.second]) { + for (Node *source : pdnodes2nodes_[edge.first]) { + for (Node *target : pdnodes2nodes_[edge.second]) { VLOG(8) << "check " << source->id() << " -- " << target->id(); // TODO(Superjomn) add some prune strategies. - for (const auto& group : pre_groups) { + for (const auto &group : pre_groups) { HitGroup new_group = group; if (IsNodesLink(source, target) && new_group.Match(source, edge.first)) { @@ -241,17 +241,17 @@ GraphPatternDetector::DetectPatterns() { } } VLOG(3) << "step " << step << " get records: " << cur_groups.size(); - for (auto& group : cur_groups) { - for (auto& item : group.roles) { + for (auto &group : cur_groups) { + for (auto &item : group.roles) { VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); } VLOG(4) << "========================================================="; } } - for (auto& group : bi_records[step % 2]) { + for (auto &group : bi_records[step % 2]) { GraphPatternDetector::subgraph_t subgraph; - for (auto& role : group.roles) { + for (auto &role : group.roles) { subgraph.emplace(role.first, role.second); } result.emplace_back(subgraph); @@ -260,16 +260,16 @@ GraphPatternDetector::DetectPatterns() { } void GraphPatternDetector::UniquePatterns( - std::vector* subgraphs) { + std::vector *subgraphs) { if (subgraphs->empty()) return; std::vector result; std::unordered_set set; - for (auto& g : *subgraphs) { + for (auto &g : *subgraphs) { size_t key = 0; - for (auto& item : g) { - key ^= std::hash{}(item.first); - key ^= std::hash{}(item.second); + for (auto &item : g) { + key ^= std::hash{}(item.first); + key ^= std::hash{}(item.second); } if (!set.count(key)) { result.emplace_back(g); @@ -280,20 +280,20 @@ void GraphPatternDetector::UniquePatterns( } void GraphPatternDetector::RemoveOverlappedMatch( - std::vector* subgraphs) { + std::vector *subgraphs) { std::vector result; - std::unordered_set node_set; + std::unordered_set node_set; - for (const auto& subgraph : *subgraphs) { + for (const auto &subgraph : *subgraphs) { bool valid = true; - for (auto& item : subgraph) { + for (auto &item : subgraph) { if (item.first->IsIntermediate() && node_set.count(item.second)) { valid = false; break; } } if (valid) { - for (auto& item : subgraph) { + for (auto &item : subgraph) { node_set.insert(item.second); } result.push_back(subgraph); @@ -307,71 +307,81 @@ std::string PDPattern::DotString() const { Dot dot; int id = 0; // Create Nodes - std::unordered_map node2dot; - for (const auto& node : nodes()) { + std::unordered_map node2dot; + for (const auto &node : nodes()) { std::string node_id = "Node" + std::to_string(id++); dot.AddNode(node_id, {}, node->name()); node2dot[node.get()] = node_id; } // Create Edges - for (const auto& edge : edges()) { + for (const auto &edge : edges()) { if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { LOG(ERROR) << "no node " << edge.first << " " << edge.second; continue; } - auto& src = node2dot.at(edge.first); - auto& trg = node2dot.at(edge.second); + auto &src = node2dot.at(edge.first); + auto &trg = node2dot.at(edge.second); dot.AddEdge(src, trg, {}); } return dot.Build(); } -PDNode& PDNode::LinksTo(const std::vector& others) { +PDNode &PDNode::LinksTo(const std::vector &others) { // extend outlinks. - for (PDNode* x : others) { + for (PDNode *x : others) { pattern_->AddEdge(this, x); } return *this; } -PDNode& PDNode::LinksFrom(const std::vector& others) { +PDNode &PDNode::LinksFrom(const std::vector &others) { // extend outlinks. - for (PDNode* x : others) { + for (PDNode *x : others) { pattern_->AddEdge(x, this); } return *this; } -PDNode* PDNode::assert_is_op() { - asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); +PDNode *PDNode::assert_is_op() { + asserts_.emplace_back([](Node *x) { return x && x->IsOp(); }); return this; } -PDNode* PDNode::assert_is_op(const std::string& op_type) { - asserts_.emplace_back([op_type](Node* x) { + +PDNode *PDNode::assert_is_op(const std::string &op_type) { + asserts_.emplace_back([op_type](Node *x) { return x && x->IsOp() && x->Op()->Type() == op_type; }); return this; } -PDNode* PDNode::assert_is_var() { - asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); + +PDNode *PDNode::assert_is_var() { + asserts_.emplace_back([](Node *x) { return x && x->IsVar(); }); + return this; +} + +PDNode *PDNode::assert_is_not_ctrl_var() { + asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); }); return this; } -PDNode* PDNode::assert_var_not_persistable() { + +PDNode *PDNode::assert_var_not_persistable() { assert_is_var(); - asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); + asserts_.emplace_back([](Node *x) { return !x->Var()->Persistable(); }); return this; } -PDNode* PDNode::assert_is_persistable_var() { + +PDNode *PDNode::assert_is_persistable_var() { assert_is_var(); - asserts_.emplace_back([=](Node* x) { return x->Var()->Persistable(); }); + asserts_.emplace_back([=](Node *x) { return x->Var()->Persistable(); }); return this; } -PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, - const std::string& argument, int nth) { + +PDNode *PDNode::assert_is_op_nth_input(const std::string &op_type, + const std::string &argument, int nth) { assert_is_var(); assert_is_op_input(op_type); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->outputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { if (op->IsOp() && op->Op()->Type() == op_type && IsNthInput(x, op, argument, nth)) return true; @@ -380,11 +390,12 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, }); return this; } -PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, - const std::string& argument, int nth) { + +PDNode *PDNode::assert_is_op_nth_output(const std::string &op_type, + const std::string &argument, int nth) { assert_is_var(); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->inputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { if (op->IsOp() && op->Op()->Type() == op_type && IsNthOutput(x, op, argument, nth)) return true; @@ -393,10 +404,11 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, }); return this; } -PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) { + +PDNode *PDNode::assert_is_only_input_of_op(const std::string &op_type) { assert_is_var(); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->outputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && op->inputs.size() == 1) { return true; @@ -406,10 +418,11 @@ PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) { }); return this; } -PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) { + +PDNode *PDNode::assert_is_only_output_of_op(const std::string &op_type) { assert_is_var(); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->inputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && op->outputs.size() == 1) { return true; @@ -419,10 +432,11 @@ PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) { }); return this; } -PDNode* PDNode::assert_is_op_output(const std::string& op_type) { + +PDNode *PDNode::assert_is_op_output(const std::string &op_type) { assert_is_var(); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->inputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { return true; } @@ -431,16 +445,17 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) { }); return this; } -PDNode* PDNode::assert_is_op_output(const std::string& op_type, - const std::string& argument) { + +PDNode *PDNode::assert_is_op_output(const std::string &op_type, + const std::string &argument) { assert_is_var(); assert_is_op_nth_output(op_type, argument, 0); return this; } -PDNode* PDNode::assert_is_op_input(const std::string& op_type) { +PDNode *PDNode::assert_is_op_input(const std::string &op_type) { assert_is_var(); - asserts_.emplace_back([=](Node* x) { - for (auto* op : x->outputs) { + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { return true; } @@ -449,72 +464,161 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) { }); return this; } -PDNode* PDNode::assert_is_op_input(const std::string& op_type, - const std::string& argument) { + +PDNode *PDNode::assert_is_op_input(const std::string &op_type, + const std::string &argument) { assert_is_var(); assert_is_op_nth_input(op_type, argument, 0); return this; } -PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) { + +PDNode *PDNode::assert_op_has_n_inputs(const std::string &op_type, size_t n) { assert_is_op(op_type); - asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; }); + asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; }); return this; } -PDNode* PDNode::assert_op_has_n_outputs(const std::string& op_type, size_t n) { + +PDNode *PDNode::assert_op_has_n_outputs(const std::string &op_type, size_t n) { assert_is_op(op_type); - asserts_.emplace_back([=](Node* x) { return x->outputs.size() == n; }); + asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; }); return this; } -PDNode* PDNode::assert_more(PDNode::teller_t&& teller) { + +PDNode *PDNode::assert_more(PDNode::teller_t &&teller) { asserts_.emplace_back(std::move(teller)); return this; } -bool VarLinksToOp(Node* node, const std::string& op_type) { - for (auto* out : node->outputs) { +PDNode *PDNode::assert_is_ops(const std::unordered_set &op_types) { + asserts_.emplace_back([op_types](Node *x) { + return x && x->IsOp() && op_types.count(x->Op()->Type()); + }); + return this; +} + +PDNode *PDNode::assert_is_ops_nth_input( + const std::unordered_set &op_types, + const std::string &argument, int nth) { + assert_is_var(); + assert_is_ops_input(op_types); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op->IsOp() && op_types.count(op->Op()->Type()) && + IsNthInput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_nth_output( + const std::unordered_set &op_types, + const std::string &argument, int nth) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op->IsOp() && op_types.count(op->Op()->Type()) && + IsNthOutput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} +PDNode *PDNode::assert_is_ops_output( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_output( + const std::unordered_set &op_types, + const std::string &argument) { + assert_is_var(); + assert_is_ops_nth_output(op_types, argument, 0); + return this; +} + +PDNode *PDNode::assert_is_ops_input( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_input( + const std::unordered_set &op_types, + const std::string &argument) { + assert_is_var(); + assert_is_ops_nth_input(op_types, argument, 0); + return this; +} + +bool VarLinksToOp(Node *node, const std::string &op_type) { + for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { return true; } } return false; } -bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) { + +bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); if (op->Op()->Input(argument).size() <= nth) return false; return var->Name() == op->Op()->Input(argument)[nth]; } -bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) { + +bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) { PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(op->IsOp()); if (op->Op()->Output(argument).size() <= nth) return false; return var->Name() == op->Op()->Output(argument)[nth]; } -void GraphSafeRemoveNodes(Graph* graph, - const std::unordered_set& nodes) { - for (auto* node : nodes) { - graph->RemoveNode(const_cast(node)); + +void GraphSafeRemoveNodes(Graph *graph, + const std::unordered_set &nodes) { + for (auto *node : nodes) { + graph->RemoveNode(const_cast(node)); } - for (auto* node : graph->Nodes()) { + for (auto *node : graph->Nodes()) { for (auto it = node->inputs.begin(); it != node->inputs.end();) { if (nodes.count(*it)) { - it = const_cast(node)->inputs.erase(it); + it = const_cast(node)->inputs.erase(it); } else { it++; } } for (auto it = node->outputs.begin(); it != node->outputs.end();) { if (nodes.count(*it)) { - it = const_cast(node)->outputs.erase(it); + it = const_cast(node)->outputs.erase(it); } else { it++; } } } } -bool VarLinksFromOp(Node* node, const std::string& op_type) { - for (auto* out : node->inputs) { + +bool VarLinksFromOp(Node *node, const std::string &op_type) { + for (auto *out : node->inputs) { if (out->IsOp() && out->Op()->Type() == op_type) { return true; } @@ -522,30 +626,30 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { return false; } -PDNode* patterns::ConvReLU::operator()( - paddle::framework::ir::PDNode* conv_input) { +PDNode *patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode *conv_input) { // Create Operators conv_input->assert_is_op_input("conv2d", "Input"); - auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); // Create variables // Filter - auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("conv2d", "Filter"); // Bias - auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) + auto *conv_bias_var = pattern->NewNode(conv_bias_repr()) ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("conv2d", "Bias"); // intermediate variable, will be removed in the IR after fuse. - auto* conv_out_var = pattern->NewNode(conv_out_repr()) + auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() ->assert_is_only_output_of_op("conv2d") ->assert_is_op_input("relu"); // output - auto* relu_out_var = pattern->NewNode(relu_out_repr()) + auto *relu_out_var = pattern->NewNode(relu_out_repr()) ->AsOutput() ->assert_is_op_output("relu"); @@ -555,18 +659,18 @@ PDNode* patterns::ConvReLU::operator()( return relu_out_var; } -PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, +PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, bool with_bias) { // Create shared nodes. x->assert_is_op_input("mul", "X"); - auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); + auto *mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); - auto* mul_w_var = pattern->NewNode(w_repr()) + auto *mul_w_var = pattern->NewNode(w_repr()) ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("mul", "Y"); - auto* mul_out_var = + auto *mul_out_var = pattern->NewNode(mul_out_repr())->assert_is_op_output("mul"); if (!with_bias) { // not with bias @@ -577,14 +681,14 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, } else { // with bias mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); // Create operators. - auto* elementwise_add = pattern->NewNode(elementwise_add_repr()) + auto *elementwise_add = pattern->NewNode(elementwise_add_repr()) ->assert_is_op("elementwise_add"); // Create variables. - auto* bias = pattern->NewNode(bias_repr()) + auto *bias = pattern->NewNode(bias_repr()) ->assert_is_op_input("elementwise_add") ->AsInput(); - auto* fc_out = pattern->NewNode(Out_repr()) + auto *fc_out = pattern->NewNode(Out_repr()) ->AsOutput() ->assert_is_op_output("elementwise_add"); @@ -594,11 +698,11 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, } } -PDNode* patterns::LSTM::operator()(PDNode* x) { +PDNode *patterns::LSTM::operator()(PDNode *x) { x->assert_is_op_input("lstm", "Input"); - auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); + auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); #define NEW_NODE(arg__, io__) \ - auto* arg__ = \ + auto *arg__ = \ pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__); // Currently, the H0 and C0 are optional @@ -619,11 +723,11 @@ PDNode* patterns::LSTM::operator()(PDNode* x) { return Hidden; } -PDNode* patterns::GRU::operator()(PDNode* x) { +PDNode *patterns::GRU::operator()(PDNode *x) { x->assert_is_op_input("gru", "Input"); - auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru"); + auto *gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru"); #define NEW_NODE(arg__, io__) \ - auto* arg__ = \ + auto *arg__ = \ pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__); NEW_NODE(Weight, input); @@ -648,6 +752,100 @@ PDNode* patterns::GRU::operator()(PDNode* x) { return Hidden; } +PDNode *patterns::ActElewiseAdd::operator()( + paddle::framework::ir::PDNode *in_var, + std::unordered_set act_types) { + in_var->assert_is_ops_input(act_types, "X"); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_ops_output(act_types); + act_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto *ele_x_var = pattern->NewNode(ele_x_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto *elementwise_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *elewise_add_out = pattern->NewNode(elewise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add", "Out"); + + act->LinksFrom({in_var}).LinksTo({act_out_var}); + elementwise_add->LinksFrom({act_out_var, ele_x_var}) + .LinksTo({elewise_add_out}); + + return elewise_add_out; +} + +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::ElewiseAddActInplaceGrad::operator()( + paddle::framework::ir::PDNode *d_act_out_var, + std::unordered_set act_types) { + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] + auto *act_grad = pattern->NewNode(act_grad_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_input(act_types, "Out"); + + auto *d_intermediate_var = + pattern->NewNode(d_itermediate_out_repr()) + ->assert_is_ops_output(act_types, GradVarName("X")); + + act_grad->LinksFrom({d_act_out_var, act_out_var}) + .LinksTo({d_intermediate_var}); + + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_input("elementwise_add_grad", "Y"); + + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + + auto *d_ele_x_var = + pattern->NewNode(d_ele_x_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + + auto *d_ele_y_var = + pattern->NewNode(d_ele_y_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + + ele_add_grad->LinksFrom({d_intermediate_var, ele_y_var}) + .LinksTo({d_ele_x_var, d_ele_y_var}); + + return ele_add_grad; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 1a8d9cefbfa570d2ac3f4fc32d50d705ddc67a75..46950ed877cda7cd83ead4e9aa9a3aaae5d5ecfa 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -95,6 +95,7 @@ struct PDNode { PDNode* assert_is_op(); PDNode* assert_is_op(const std::string& op_type); PDNode* assert_is_var(); + PDNode* assert_is_not_ctrl_var(); PDNode* assert_var_not_persistable(); PDNode* assert_is_persistable_var(); PDNode* assert_is_op_output(const std::string& op_type); @@ -113,6 +114,20 @@ struct PDNode { PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n); PDNode* assert_more(teller_t&& teller); + PDNode* assert_is_ops_output(const std::unordered_set& op_types); + PDNode* assert_is_ops(const std::unordered_set& op_types); + PDNode* assert_is_ops_output(const std::unordered_set& op_types, + const std::string& argument); + PDNode* assert_is_ops_nth_input( + const std::unordered_set& op_types, + const std::string& argument, int nth); + PDNode* assert_is_ops_input(const std::unordered_set& op_types); + PDNode* assert_is_ops_input(const std::unordered_set& op_types, + const std::string& argument); + PDNode* assert_is_ops_nth_output( + const std::unordered_set& op_types, + const std::string& argument, int nth); + private: PDNode(PDPattern* pattern, const std::string& name = "", Type type = Type::kVar) @@ -447,6 +462,68 @@ struct GRU : public PatternBase { PATTERN_DECL_NODE(Hidden); }; +// The following patterns are used to fuse elewise_add and act +// formula: act(ele_add(x, y)) +// op: elementwise_add + act +// named nodes: elementwise_add, act +// ele_x, ele_y, elewise_add_out, act_out +struct ElewiseAddAct : public PatternBase { + ElewiseAddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewise_add_act") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + // declare variable node's name + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_y); + PATTERN_DECL_NODE(act_out); +}; + +// formula: ele_add(x, act(y)) +// op: elementwise_add + act +// named nodes: elementwise_add, act +// act_in, act_out, ele_x, elewise_add_out +struct ActElewiseAdd : public PatternBase { + ActElewiseAdd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "act_elewise_add") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(ele_add); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(ele_x); + PATTERN_DECL_NODE(elewise_add_out); +}; + +// the backward of act(ele_add(x, y)) +// the act is inplace. +// op: elementwise_add_grad + act_grad +// named nodes: elementwise_add_grad, act_grad +// act_out, act_out_g, ele_y, d_itermediate_out, d_ele_x, d_ele_y +struct ElewiseAddActInplaceGrad : public PatternBase { + ElewiseAddActInplaceGrad(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewise_add_act_grad1") {} + + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(act_grad); + PATTERN_DECL_NODE(ele_add_grad); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(d_itermediate_out); + PATTERN_DECL_NODE(d_ele_x); + PATTERN_DECL_NODE(d_ele_y); + PATTERN_DECL_NODE(ele_y); +}; } // namespace patterns // Link two ir::Nodes from each other. @@ -454,6 +531,12 @@ struct GRU : public PatternBase { a->outputs.push_back(b); \ b->inputs.push_back(a); +// Set the out_var as the output of the op +#define IR_OP_VAR_LINK(op, out_var) \ + op->outputs.push_back(out_var); \ + out_var->inputs.clear(); \ + out_var->inputs.push_back(op); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 2817fcf5320f00affdcba097681c7ab20f0eb227..9277abe8c1b79c5f76f4610d0554bf337f329518 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -19,6 +19,11 @@ namespace framework { namespace ir { constexpr char Node::kControlDepVarName[]; int Node::count_ = 0; + +std::unique_ptr CreateNodeForTest(const std::string& name, + Node::Type type) { + return std::unique_ptr(new Node(name, type)); +} } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d53d789d3ad27b8f9606a396264d91e5f07a9d10..5d6da9f1d76a3c0fc64b7ff35264e385cf19a14b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -24,32 +24,12 @@ namespace paddle { namespace framework { namespace ir { +// Node should normally created by Graph::CreateXXXNode(). class Node { public: enum class Type { kOperation, kVariable }; static constexpr char kControlDepVarName[] = "__control_var"; - explicit Node(const std::string& name, Type type) - : name_(name), - var_desc_(nullptr), - op_desc_(nullptr), - type_(type), - id_(count_++) {} - - explicit Node(VarDesc* var_desc) - : name_(var_desc->Name()), - var_desc_(new VarDesc(*var_desc)), - op_desc_(nullptr), - type_(Type::kVariable), - id_(count_++) {} - - explicit Node(OpDesc* op_desc) - : name_(op_desc->Type()), - var_desc_(nullptr), - op_desc_(new OpDesc(*op_desc, op_desc->Block())), - type_(Type::kOperation), - id_(count_++) {} - Type NodeType() const { return type_; } std::string Name() const { return name_; } @@ -68,6 +48,10 @@ class Node { bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } + bool IsCtrlVar() const { + return type_ == Type::kVariable && + Name().find(ir::Node::kControlDepVarName) != std::string::npos; + } std::vector inputs; std::vector outputs; @@ -81,11 +65,40 @@ class Node { private: friend class Graph; + friend std::unique_ptr CreateNodeForTest(const std::string& name, + Node::Type type); + + explicit Node(const std::string& name, Type type) + : name_(name), + var_desc_(nullptr), + op_desc_(nullptr), + type_(type), + id_(count_++) {} + + explicit Node(VarDesc* var_desc) + : name_(var_desc->Name()), + var_desc_(new VarDesc(*var_desc)), + op_desc_(nullptr), + type_(Type::kVariable), + id_(count_++) {} + + explicit Node(OpDesc* op_desc) + : name_(op_desc->Type()), + var_desc_(nullptr), + op_desc_(new OpDesc(*op_desc, op_desc->Block())), + type_(Type::kOperation), + id_(count_++) {} + + Node() = delete; + static int count_; static void ResetId() { count_ = 0; } DISABLE_COPY_AND_ASSIGN(Node); }; +std::unique_ptr CreateNodeForTest(const std::string& name, + Node::Type type); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 7836ecb1272a07a79a70c9cb040335f9a42e5684..ba2c41eb8968e30bc8a801f4d8d239da8c522de9 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -17,10 +17,12 @@ #include #include #include +#include #include - +#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -28,206 +30,401 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) +namespace details { +struct CUDABuffer { + void *data_{nullptr}; + size_t size_{0}; + platform::CUDAPlace place_; + + CUDABuffer() {} + CUDABuffer(platform::Place place, size_t size) + : size_(size), place_(boost::get(place)) { + data_ = memory::Alloc(place_, size); + } + + ~CUDABuffer() { ClearMemory(); } + + CUDABuffer(const CUDABuffer &o) = delete; + CUDABuffer &operator=(const CUDABuffer &o) = delete; + + void Resize(platform::Place place, size_t size) { + ClearMemory(); + place_ = boost::get(place); + data_ = memory::Alloc(place_, size); + size_ = size; + } + + void Swap(CUDABuffer &o) { + std::swap(data_, o.data_); + std::swap(place_, o.place_); + std::swap(size_, o.size_); + } + + private: + void ClearMemory() const { + if (data_) { + memory::Free(place_, data_); + } + } +}; +} // namespace details + // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template class Vector { public: using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; - // Default ctor. Create empty Vector - Vector() { InitEmpty(); } + private: + // The actual class to implement vector logic + class VectorData { + public: + VectorData() : flag_(kDataInCPU) {} + VectorData(size_t count, const T &value) + : cpu_(count, value), flag_(kDataInCPU) {} + VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} + template + explicit VectorData(const std::vector &dat) + : cpu_(dat), flag_(kDataInCPU) {} + + VectorData(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + } - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) { - InitEmpty(); - if (count != 0) { - resize(count); - T *ptr = begin(); - for (size_t i = 0; i < count; ++i) { - ptr[i] = value; + VectorData &operator=(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + details::CUDABuffer null; + gpu_.Swap(null); + return *this; + } + + T &operator[](size_t i) { + MutableCPU(); + return cpu_[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_[i]; + } + + size_t size() const { return cpu_.size(); } + + iterator begin() { + MutableCPU(); + return cpu_.begin(); + } + + iterator end() { + MutableCPU(); + return cpu_.end(); + } + + T &front() { + MutableCPU(); + return cpu_.front(); + } + + T &back() { + MutableCPU(); + return cpu_.back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return cpu_.begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return cpu_.end(); + } + + const T &back() const { + ImmutableCPU(); + return cpu_.back(); + } + + T *data() { return &(*this)[0]; } + + const T *data() const { return &(*this)[0]; } + + const T &front() const { + ImmutableCPU(); + return cpu_.front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + cpu_.assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + cpu_.push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(this->cpu_); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + cpu_.resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return reinterpret_cast(gpu_.data_); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + cpu_.clear(); + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { return cpu_.capacity(); } + + // reserve data + void reserve(size_t size) { cpu_.reserve(size); } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + ImmutableCPU(); + return cpu_; + } + + bool operator==(const VectorData &other) const { + ImmutableCPU(); + other.ImmutableCPU(); + return cpu_ == other.cpu_; + } + + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + void *src = gpu_.data_; + void *dst = cpu_.data(); + memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, + nullptr); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); } + flag_ = kDirty | kDataInCPU; } - } - // Ctor with init_list - Vector(std::initializer_list init) { - if (init.size() == 0) { - InitEmpty(); - } else { - InitByIter(init.size(), init.begin(), init.end()); + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && + !(boost::get(place) == gpu_.place_)) { + CopyCUDADataToAnotherPlace(place); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(boost::get(place) == gpu_.place_)) { + CopyCUDADataToAnotherPlace(place); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } } - } + void CopyCUDADataToAnotherPlace(const platform::Place &place) const { + details::CUDABuffer tmp(place, gpu_.size_); + const void *src = gpu_.data_; + void *dst = tmp.data_; + + memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr); + gpu_.Swap(tmp); + } + void CopyCPUDataToCUDA(const platform::Place &place) const { + void *src = cpu_.data(); + gpu_.Resize(place, cpu_.size() * sizeof(T)); + void *dst = gpu_.data_; + auto stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, + stream); + } + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + mutable std::vector cpu_; + mutable details::CUDABuffer gpu_; + mutable int flag_; + }; + + public: + // Default ctor. Create empty Vector + Vector() : m_(new VectorData()) {} + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T &value = T()) + : m_(new VectorData(count, value)) {} + + // Ctor with init_list + Vector(std::initializer_list init) : m_(new VectorData(init)) {} // implicit cast from std::vector. template - Vector(const std::vector &dat) { // NOLINT - if (dat.size() == 0) { - InitEmpty(); - } else { - InitByIter(dat.size(), dat.begin(), dat.end()); - } + Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT } // Copy ctor - Vector(const Vector &other) { this->operator=(other); } + Vector(const Vector &other) { m_ = other.m_; } // Copy operator Vector &operator=(const Vector &other) { - if (other.size() != 0) { - this->InitByIter(other.size(), other.begin(), other.end()); - } else { - InitEmpty(); - } + m_ = other.m_; return *this; } // Move ctor - Vector(Vector &&other) { - this->size_ = other.size_; - this->flag_ = other.flag_; - if (other.cuda_vec_.memory_size()) { - this->cuda_vec_.ShareDataWith(other.cuda_vec_); - } - if (other.cpu_vec_.memory_size()) { - this->cpu_vec_.ShareDataWith(other.cpu_vec_); - } - } + Vector(Vector &&other) { m_ = std::move(other.m_); } // CPU data access method. Mutable. - T &operator[](size_t i) { - MutableCPU(); - return const_cast(cpu_vec_.data())[i]; - } + T &operator[](size_t i) { return (*m_)[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { - ImmutableCPU(); - return cpu_vec_.data()[i]; - } + const T &operator[](size_t i) const { return (*m_)[i]; } // std::vector iterator methods. Based on CPU data access method - size_t size() const { return size_; } + size_t size() const { return m_->size(); } - T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } + iterator begin() { return m_->begin(); } - T *end() { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + iterator end() { return m_->end(); } - T &front() { return *begin(); } + T &front() { return m_->front(); } - T &back() { - auto it = end(); - --it; - return *it; - } + T &back() { return m_->back(); } - const T *begin() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); - } + const_iterator begin() const { return m_->begin(); } - const T *end() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + const_iterator end() const { return m_->end(); } - const T *cbegin() const { return begin(); } + const_iterator cbegin() const { return begin(); } - const T *cend() const { return end(); } + const_iterator cend() const { return end(); } - const T &back() const { - auto it = end(); - --it; - return *it; - } + const T &back() const { return m_->back(); } - T *data() { return begin(); } + T *data() { return m_->data(); } - const T *data() const { return begin(); } + const T *data() const { return m_->data(); } - const T &front() const { return *begin(); } + const T &front() const { return m_->front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - InitByIter(end - begin, begin, end); + m_->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { - if (size_ + 1 > capacity()) { - reserve((size_ + 1) << 1); - } - *end() = elem; - ++size_; - } + void push_back(T elem) { m_->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - size_t pre_size = size_; - resize(pre_size + (end - begin)); - T *ptr = this->begin() + pre_size; - for (; begin < end; ++begin, ++ptr) { - *ptr = *begin; - } + m_->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (size + 1 <= capacity()) { - size_ = size; - } else { - MutableCPU(); - Tensor cpu_tensor; - platform::Place cpu = platform::CPUPlace(); - T *ptr = cpu_tensor.mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - const T *old_ptr = - cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + size_, ptr); - } - size_ = size; - cpu_vec_.ShareDataWith(cpu_tensor); + if (m_.Data().size() != size) { + m_->resize(size); } } // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "CUDA Data must on CUDA place"); - ImmutableCUDA(place); - return cuda_vec_.data(); + return m_.Data().CUDAData(place); } // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { - const T *ptr = CUDAData(place); - flag_ = kDirty | kDataInCUDA; - return const_cast(ptr); + return m_->CUDAMutableData(place); } // clear - void clear() { - size_ = 0; - flag_ = kDirty | kDataInCPU; - } + void clear() { m_->clear(); } - size_t capacity() const { - return cpu_vec_.memory_size() / SizeOfType(typeid(T)); - } + size_t capacity() const { return m_->capacity(); } // reserve data - void reserve(size_t size) { - size_t pre_size = size_; - resize(size); - resize(pre_size); - } + void reserve(size_t size) { m_->reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -248,12 +445,7 @@ class Vector { } // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - std::vector result; - result.resize(size()); - std::copy(begin(), end(), result.begin()); - return result; - } + operator std::vector() const { return *m_; } bool operator==(const Vector &other) const { if (size() != other.size()) return false; @@ -267,118 +459,11 @@ class Vector { return true; } - private: - void InitEmpty() { - size_ = 0; - flag_ = kDataInCPU; - } - - template - void InitByIter(size_t size, Iter begin, Iter end) { - platform::Place cpu = platform::CPUPlace(); - T *ptr = this->cpu_vec_.template mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - for (size_t i = 0; i < size; ++i) { - *ptr++ = *begin++; - } - flag_ = kDataInCPU | kDirty; - size_ = size; - } - - enum DataFlag { - kDataInCPU = 0x01, - kDataInCUDA = 0x02, - // kDirty means the data has been changed in one device. - kDirty = 0x10 - }; - - void CopyToCPU() const { - // COPY GPU Data To CPU - TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); - WaitPlace(cuda_vec_.place()); - } - - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - - void ImmutableCUDA(platform::Place place) const { - if (IsDirty()) { - if (IsInCPU()) { - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - UnsetFlag(kDirty); - SetFlag(kDataInCUDA); - } else if (IsInCUDA() && !(place == cuda_vec_.place())) { - framework::Tensor tmp; - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - cuda_vec_.ShareDataWith(tmp); - // Still dirty - } else { - // Dirty && DataInCUDA && Device is same - // Do nothing - } - } else { - if (!IsInCUDA()) { - // Even data is not dirty. However, data is not in CUDA. Copy data. - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - SetFlag(kDataInCUDA); - } else if (!(place == cuda_vec_.place())) { - framework::Tensor tmp; - WaitPlace(cuda_vec_.place()); - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - WaitPlace(place); - cuda_vec_.ShareDataWith(tmp); - } else { - // Not Dirty && DataInCUDA && Device is same - // Do nothing. - } - } - } - - void ImmutableCPU() const { - if (IsDirty() && - !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. - CopyToCPU(); - UnsetFlag(kDirty); - } - SetFlag(kDataInCPU); - } - - void UnsetFlag(int flag) const { flag_ &= ~flag; } - void SetFlag(int flag) const { flag_ |= flag; } + const void *Handle() const { return &m_.Data(); } - bool IsDirty() const { return flag_ & kDirty; } - - bool IsInCUDA() const { return flag_ & kDataInCUDA; } - - bool IsInCPU() const { return flag_ & kDataInCPU; } - - static void WaitPlace(const platform::Place place) { - if (platform::is_gpu_place(place)) { - platform::DeviceContextPool::Instance() - .Get(boost::get(place)) - ->Wait(); - } - } - - static T &EmptyDummy() { - static T dummy = T(); - return dummy; - } - - mutable int flag_; - mutable Tensor cpu_vec_; - mutable Tensor cuda_vec_; - size_t size_; + private: + // Vector is an COW object. + details::COWPtr m_; }; #else // PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 555faba9624b9c76a9efdf4a62cd319f9682566e..86f6147cf7ac1e82ac2904bbcdcf9697422560ce 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -441,7 +441,10 @@ static void InitInferShapeFuncs() { for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { auto op_type = kern_pair.first; - auto &op_info = info_map.at(op_type); + auto it = info_map.find(op_type); + PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered", + op_type); + auto &op_info = it->second; auto op = static_cast(op_info.Creator()( "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ae393d66a3b3ec0141667b44b5d9f3158e434e37..f5a54c0f48c98512dcb393d63a61f3c927e7ac1f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -57,6 +57,21 @@ std::unique_ptr ApplyParallelExecutorPass( graph = viz_pass->Apply(std::move(graph)); } + // Apply op fusion. + if (strategy.fuse_elewise_add_act_ops_) { + auto fuse_elewise_add_act_pass = + ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass"); + graph = fuse_elewise_add_act_pass->Apply(std::move(graph)); + // Apply a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + } + // Convert graph to run on multi-devices. auto multi_devices_pass = ir::PassRegistry::Instance().Get("multi_devices_pass"); @@ -233,30 +248,9 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { - // the initializing bcast, all vars would be bcast from device(0), - // otherwise - // bcast from the specified device. - bool initializing = member_->executor_ ? false : true; + // the initializing bcast, all vars would be bcast from device(0). for (auto &var : vars) { - int var_dev_id = -1; - if (member_->executor_) { - auto &sharded_var_device = - member_->executor_->Graph().Get( - details::kShardedVarDevice); - if (sharded_var_device.find(var) != sharded_var_device.end()) { - var_dev_id = sharded_var_device.at(var); - } - } - - if (!initializing && var_dev_id == -1) continue; - - framework::Variable *main_var = nullptr; - if (initializing) { - main_var = member_->local_scopes_[0]->FindVar(var); - } else { - main_var = member_->local_scopes_[var_dev_id]->FindVar(var); - } - + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); if (main_var == nullptr || !main_var->IsType()) { continue; } @@ -272,8 +266,7 @@ void ParallelExecutor::BCastParamsToDevices( auto place = member_->places_[i]; void *buffer; - if ((initializing && i == 0) || - (!initializing && static_cast(i) == var_dev_id)) { + if (i == 0) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -290,29 +283,18 @@ void ParallelExecutor::BCastParamsToDevices( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); - if (initializing) { - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); - } else { - if (var_dev_id >= 0) { - platform::dynload::ncclBcast(buffers[i], numel, data_type, - var_dev_id, nccl_ctx.comm_, - nccl_ctx.stream()); - } - } + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); } member_->nccl_ctxs_->WaitAll(); } - #else PADDLE_THROW("Not compiled with CUDA"); #endif } else { platform::CPUPlace cpu; for (size_t i = 0; i < member_->places_.size(); ++i) { - if ((initializing && i == 0) || - (!initializing && static_cast(i) == var_dev_id)) - continue; + if (i == 0) continue; auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -392,6 +374,7 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 88e2078454024c3a4d437615d3e6b15ee0c7d6a1..c64906ff230df5f2b7cc9f5c6b29d68956ab8f33 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -72,9 +72,9 @@ class ParallelExecutor { void Run(const std::vector &fetch_tensors, const std::string &fetched_var_name); + private: void BCastParamsToDevices(const std::unordered_set &vars) const; - private: ParallelExecutorPrivate *member_; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index 57c1b822d8d4f095f33cba2bfd5210f7ee19dd9f..0afcd85fe7c2e6806eb67797300e2731977573fb 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -183,28 +183,5 @@ void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { output->clear_blocks(); prune_impl(input, output, 0, -1, &dependent_vars); } - -void inference_optimize_impl(proto::ProgramDesc* input, int block_id) { - auto* op_field = input->mutable_blocks(block_id)->mutable_ops(); - for (auto& op_desc : *op_field) { - for (auto& attr : *op_desc.mutable_attrs()) { - if (attr.name() == "is_test") { - attr.set_b(true); - break; - } - } - } -} - -void InferenceOptimize(const proto::ProgramDesc& input, - proto::ProgramDesc* output) { - *output = input; - int num_blocks = output->blocks_size(); - PADDLE_ENFORCE_GT(num_blocks, 0, "ProgramDesc must have at least one block"); - for (int i = 0; i < num_blocks; ++i) { - inference_optimize_impl(output, i); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h index 4c5a1dedd9950553e93dc681a40af179bdb8a30d..1be7cd25d099a18e6b7e2afaf34b1632f881b823 100644 --- a/paddle/fluid/framework/prune.h +++ b/paddle/fluid/framework/prune.h @@ -22,8 +22,5 @@ namespace framework { void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); -void InferenceOptimize(const proto::ProgramDesc& input, - proto::ProgramDesc* output); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ccb7fa1f8cce8cc757038904bce762af3b5ff30b..9c67df7bdfb2c4e5d1c9fe60676c412ab11b4fa5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -252,12 +252,12 @@ endif() op_library(cross_entropy_op DEPS cross_entropy) if(WITH_GPU) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub) + op_library(sequence_softmax_op DEPS cub) else() op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) endif() op_library(softmax_op DEPS softmax) -op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 149226e92d4d08a25c211bce686ff03c5d7ddf40..b8b8b2290a0f002fd379032e28590b84a1da38e9 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/framework/lod_rank_table.h" @@ -24,6 +25,50 @@ namespace operators { using LoD = framework::LoD; +struct ArrayToLoDFunctor; +template +struct ArrayToLoDFunctorImpl { + const ArrayToLoDFunctor *prev_functor_; + DeviceContext *dev_ctx_; + + template + void apply(); +}; + +struct ArrayToLoDFunctor : public boost::static_visitor { + std::vector in; + mutable framework::Tensor *out; + + template + void operator()(Place place) const { + auto &pool = platform::DeviceContextPool::Instance(); + if (std::is_same::value) { + Apply(static_cast(pool.Get(place))); + } else { +#ifdef PADDLE_WITH_CUDA + Apply(static_cast(pool.Get(place))); +#else + PADDLE_THROW("Fluid is not compiled with CUDA"); +#endif + } + } + + template + void Apply(DeviceContext *dev_ctx) const { + ArrayToLoDFunctorImpl functor; + functor.dev_ctx_ = dev_ctx; + functor.prev_functor_ = this; + framework::VisitDataType(framework::ToDataType(out->type()), functor); + } +}; + +template +template +void ArrayToLoDFunctorImpl::apply() { + math::ConcatFunctor func; + func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out); +} + class ArrayToLoDTensorOp : public framework::OperatorBase { public: ArrayToLoDTensorOp(const std::string &type, @@ -47,14 +92,18 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { int rank = x[0].dims().size(); platform::Place place = x[0].place(); std::type_index data_type = x[0].type(); - framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank); int64_t batch_size = x[0].dims()[0]; + framework::DDim ins_dims = rank > 1 + ? framework::slice_ddim(x[0].dims(), 1, rank) + : framework::make_ddim({0}); for (size_t i = 1; i < x.size(); ++i) { - PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims, + auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank) + : framework::make_ddim({0}); + PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims, "The dimension of the %zu'th element in LoDTensorArray " "differs from previous ones.", i); - PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place), + PADDLE_ENFORCE(x[i].place() == place, "The place class of the %zu'th element in LoDTensorArray " "differs from previous ones.", i); @@ -82,13 +131,14 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { // Build LoDTensor `out` framework::LoD *out_lod = out->mutable_lod(); out_lod->clear(); - size_t out_offset = 0; auto prefix_lod = rank_table.coarse_lod(); prefix_lod.emplace_back(); auto &cur_level_lod = prefix_lod.back(); cur_level_lod.push_back(0); + ArrayToLoDFunctor functor; for (size_t idx : table_item_idx) { cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); + PADDLE_ENFORCE_LE(table_items[idx].length, x.size()); for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) { auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( x[x_idx].lod(), idx, idx + 1, 0); @@ -106,17 +156,11 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - auto slice = out->Slice(out_offset, out_offset + len); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place, - dev_ctx, &slice); - out_offset += len; + functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset)); } } + functor.out = out; + platform::VisitPlace(place, functor); out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end()); } }; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index c72405593788493e10a1293b0c722e2d11c6e312..bc58612f9d3a2b433f362787135b6bb23b203f63 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -95,6 +95,7 @@ class ConcatOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); } }; diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 78be2e1e1f06c7a518e35a770c1dc9581b2d10fe..b2c6495c442cd02679825425becc2160c303dcc6 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -109,8 +109,9 @@ class ConcatGradKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); paddle::operators::math::ConcatGradFunctor concat_grad_functor; - concat_grad_functor(dev_ctx, *out_grad, ins, static_cast(axis), - &outputs); + concat_grad_functor(dev_ctx, *out_grad, + ctx.MultiInput("X"), + static_cast(axis), &outputs); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index eeb98ee44f206dbfbe1f61689aa9843122ae3f92..a916dd3496ffaffa138529a8a2f7e20ef26fcc96 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -29,6 +29,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { auto in_dims = ctx->GetInputDim("Input"); auto filter_dims = ctx->GetInputDim("Filter"); + std::vector output_size = + ctx->Attrs().Get>("output_size"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); std::vector dilations = ctx->Attrs().Get>("dilations"); @@ -42,6 +44,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U, "ConvTransposeOp input dimension and strides dimension should " "be consistent."); + if (output_size.size()) + PADDLE_ENFORCE_EQ(output_size.size(), strides.size(), + "ConvTransposeOp output_size dimension and strides " + "dimension should be the same."); PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), "ConvTransposeOp paddings dimension and strides " "dimension should be the same."); @@ -55,8 +61,17 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { std::vector output_shape({in_dims[0], filter_dims[1] * groups}); for (size_t i = 0; i < strides.size(); ++i) { auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; - output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + - filter_extent); + auto infer_shape = + (in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_extent; + if (output_size.size()) { + PADDLE_ENFORCE((output_size[i] >= infer_shape && + output_size[i] < infer_shape + strides[i]), + "ConvTransposeOp output_size should be " + "in appropriate range."); + output_shape.push_back(output_size[i]); + } else { + output_shape.push_back(infer_shape); + } } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } @@ -103,6 +118,10 @@ void Conv2DTransposeOpMaker::Make() { AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); + AddAttr>("output_size", + "(vector default: []), the " + "size of the output tensor") + .SetDefault({}); AddAttr("groups", "(int default:1), the groups number of the convolution " "transpose operator. ") @@ -192,7 +211,10 @@ void Conv3DTransposeOpMaker::Make() { "Where N is batch size, C is " "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); - + AddAttr>("output_size", + "(vector default: []), the " + "size of the output tensor") + .SetDefault({}); AddAttr>( "dilations", "(vector default:{1, 1, 1}), the " @@ -247,7 +269,7 @@ Parameters(strides, paddings) are three elements. These three elements represent depth, height and width, respectively. The input(X) size and output(Out) size may be different. -Example: +Example: Input: Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 03974a7fc511b1e1cb5b0eca532b260fdf9bf964..f123e11542d85c904a81fe2a87f59ab52511cc15 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -86,10 +86,10 @@ class XeGradFunctor { auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; for (size_t x_offset = sample_id * num_classes_; x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = - (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_) - ? static_cast(0) - : -dy_[sample_id] / x_[x_offset]; + dx_[x_offset] = (x_offset != x_is_true_offset || + label_[sample_id] == static_cast(ignore_index_)) + ? static_cast(0) + : -dy_[sample_id] / x_[x_offset]; } } diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index 48bdce740878ea486eda6821dc29885a3e480114..a800d5df0a7cbc668a0217350098bce2bfdcfa70 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -24,10 +24,22 @@ namespace detail { * and passed by `args` */ template -inline T &Ref(T *ptr, ARGS &&... args) { +inline T& Ref(T* ptr, ARGS&&... args) { PADDLE_ENFORCE(ptr != nullptr, args...); return *ptr; } + +template +inline std::vector> VectorRef( + const std::vector& vec, ARGS&&... args) { + std::vector> result; + result.reserve(vec.size()); + for (auto* ptr : vec) { + result.emplace_back(Ref(ptr, args...)); + } + return result; +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be..dd5d138a1e979826d59c4731920379b030e3b492 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto ap_type = GetAPType(ctx.Attr("ap_type")); int class_num = ctx.Attr("class_num"); - auto label_lod = in_label->lod(); - auto detect_lod = in_detect->lod(); + auto& label_lod = in_label->lod(); + auto& detect_lod = in_detect->lod(); PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), @@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto labels = framework::EigenTensor::From(input_label); auto detect = framework::EigenTensor::From(input_detect); - auto label_lod = input_label.lod(); - auto detect_lod = input_detect.lod(); + auto& label_lod = input_label.lod(); + auto& detect_lod = input_detect.lod(); int batch_size = label_lod[0].size() - 1; - auto label_index = label_lod[0]; + auto& label_index = label_lod[0]; for (int n = 0; n < batch_size; ++n) { std::map> boxes; @@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_true_pos->set_lod(true_pos_lod); output_false_pos->set_lod(false_pos_lod); - return; } void GetInputPos(const framework::Tensor& input_pos_count, @@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod()[0]; + auto& pos_data_lod = pos_tensor.lod()[0]; for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; @@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>* false_pos) const { int batch_size = gt_boxes.size(); for (int n = 0; n < batch_size; ++n) { - auto image_gt_boxes = gt_boxes[n]; - for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { + auto& image_gt_boxes = gt_boxes[n]; + for (auto& image_gt_box : image_gt_boxes) { size_t count = 0; - auto labeled_bboxes = it->second; + auto& labeled_bboxes = image_gt_box.second; if (evaluate_difficult) { count = labeled_bboxes.size(); } else { - for (size_t i = 0; i < labeled_bboxes.size(); ++i) - if (!(labeled_bboxes[i].is_difficult)) ++count; + for (auto& box : labeled_bboxes) { + if (!box.is_difficult) { + ++count; + } + } } if (count == 0) { continue; } - int label = it->first; + int label = image_gt_box.first; if (label_pos_count->find(label) == label_pos_count->end()) { (*label_pos_count)[label] = count; } else { diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc index 7cd67e74de6b9c4fbc718f60b4f671ccab2f9956..86a8459a79135d1fbcba6886172acc5a2abdb88b 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -13,9 +13,45 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise_mul_op.h" +#include #include "paddle/fluid/operators/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseMulOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_mul_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + return op; + } +}; + +class ElementwiseMulOpMaker : public ElementwiseOpMaker { + protected: + virtual std::string GetName() const { return "Mul"; } + virtual std::string GetEquation() const { return "Out = X \\\\odot Y"; } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y"); +REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp, + ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwiseMulOpGradDescMaker); +REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index 4437da4d95f97b5cbbca1650badf9710c26b4380..b870d08a1a28fd3e678aeb7211f7e3ec8b2c4c65 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -93,8 +93,8 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = dout; // out is not necessary auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index b1a399c22c2b9ed7464a1b1764478803d4416d94..7c84a9d813948ab7347446872643c2e00823a5ad 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -987,18 +987,28 @@ void FusedElemwiseAndActComputeWithBroadcast( } // --- backward -template +template struct FusedElemwiseAndActGradNoBroadcast { HOSTDEVICE void operator()(size_t i) { if (dx_ != nullptr) { - dx_[i] = UseIntermediateOut ? dx_op_(x_[i], y_[i], intermediate_out_[i], - out_[i], dout_[i]) - : dx_op_(x_[i], y_[i], out_[i], dout_[i]); + dx_[i] = UseIntermediateOut + ? dx_op_.UseIntermediateOut( + x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i]) + : dx_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); } if (dy_ != nullptr) { - dy_[i] = UseIntermediateOut ? dy_op_(x_[i], y_[i], intermediate_out_[i], - out_[i], dout_[i]) - : dy_op_(x_[i], y_[i], out_[i], dout_[i]); + dy_[i] = UseIntermediateOut + ? dy_op_.UseIntermediateOut( + x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i]) + : dy_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); + } + if (dintermediate_ != nullptr) { + dintermediate_[i] = + UseIntermediateOut + ? dintermediate_op_.UseIntermediateOut( + x_[i], intermediate_out_[i], out_[i], dout_[i]) + : dintermediate_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); } } @@ -1009,37 +1019,44 @@ struct FusedElemwiseAndActGradNoBroadcast { const T *dout_; DX_OP dx_op_; DY_OP dy_op_; + DIntermediate_OP dintermediate_op_; T *dx_; T *dy_; + T *dintermediate_; }; template + typename DIntermediate_OP, bool UseIntermediateOut> void FusedElemwiseAndActGradComputeNoBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::DDim &y_dim, const framework::Tensor *x, const framework::Tensor *y, const framework::Tensor *intermediate_out, const framework::Tensor *out, const framework::Tensor *dout, int axis, - framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + framework::Tensor *dx, framework::Tensor *dy, + framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op) { size_t N = static_cast(framework::product(x_dim)); platform::ForRange for_range( ctx.template device_context(), N); for_range( - FusedElemwiseAndActGradNoBroadcast{ + FusedElemwiseAndActGradNoBroadcast{ x->data(), y->data(), intermediate_out ? intermediate_out->data() : nullptr, - out->data(), dout->data(), dx_op, dy_op, + out->data(), dout->data(), dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())}); + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), + dintermediate == nullptr ? nullptr : dintermediate->mutable_data( + ctx.GetPlace())}); } -template -static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, - const T *intermediate_out, - const T *out, const T *dout, - int h, int w, DX_OP dx_op, - DY_OP dy_op, T *dx, T *dy) { +template +static void FusedElemwiseAndActGradBroadcast1CPU( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int64_t tmp_out_idx, x_idx, y_idx; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { @@ -1055,9 +1072,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, if (dx != nullptr) { T tmp = UseIntermediateOut - ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); if (BcastY) { dx[x_idx] = tmp; @@ -1071,9 +1090,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, } if (dy != nullptr) { T tmp = UseIntermediateOut - ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); if (BcastY) { if (i == 0) { dy[y_idx] = tmp; @@ -1084,18 +1105,34 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, dy[y_idx] = tmp; } } + if (d_intermediate != nullptr) { + T tmp = UseIntermediateOut + ? dintermediate_op.UseIntermediateOut( + x[x_idx], intermediate_out[tmp_out_idx], out[offset], + dout[offset]) + : dintermediate_op.Recompute(x[x_idx], y[y_idx], + out[offset], dout[i]); + if (SameShapeOfIntermediateOutAndOut) { + d_intermediate[tmp_out_idx] = tmp; + } else { + if (i == 0) { + d_intermediate[tmp_out_idx] = tmp; + } else { + d_intermediate[tmp_out_idx] += tmp; + } + } + } } } } -template -static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, - const T *intermediate_out, - const T *out, const T *dout, - int pre, int n, int post, - DX_OP dx_op, DY_OP dy_op, - T *dx, T *dy) { +template +static void FusedElemwiseAndActGradBroadcast2CPU( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int64_t tmp_out_idx, x_idx, y_idx; for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { @@ -1112,9 +1149,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, if (dx != nullptr) { T tmp = UseIntermediateOut - ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); if (BcastY) { dx[x_idx] = tmp; @@ -1128,9 +1167,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, } if (dy != nullptr) { T tmp = UseIntermediateOut - ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); if (BcastY) { if (i == 0 && k == 0) { dy[y_idx] = tmp; @@ -1141,21 +1182,40 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, dy[y_idx] = tmp; } } + if (d_intermediate != nullptr) { + T tmp = UseIntermediateOut + ? dintermediate_op.UseIntermediateOut( + x[x_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dintermediate_op.Recompute(x[x_idx], y[y_idx], + out[offset], dout[i]); + if (SameShapeOfIntermediateOutAndOut) { + d_intermediate[tmp_out_idx] = tmp; + } else { + if (i == 0) { + d_intermediate[tmp_out_idx] = tmp; + } else { + d_intermediate[tmp_out_idx] += tmp; + } + } + } } } } } #ifdef __NVCC__ -template +template static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( const T *x, const T *y, const T *intermediate_out, const T *out, - const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int j = blockIdx.x; int i = threadIdx.x; int tid = threadIdx.x; - T val(0); + T val(0), inter_val(0); int64_t tmp_out_idx, x_idx, y_idx; do { @@ -1170,10 +1230,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( } if (dx != nullptr) { - T tmp = UseIntermediateOut - ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + T tmp = + UseIntermediateOut + ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]); if (BcastY) { dx[x_idx] = tmp; @@ -1182,23 +1244,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( } } if (dy != nullptr) { - T tmp = UseIntermediateOut - ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + T tmp = + UseIntermediateOut + ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]); if (BcastY) { val += tmp; } else { dy[y_idx] = tmp; } } + if (d_intermediate != nullptr) { + T tmp = UseIntermediateOut + ? dintermediate_op.UseIntermediateOut( + y[y_idx], intermediate_out[tmp_out_idx], out[offset], + dout[offset]) + : dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); + if (SameShapeOfIntermediateOutAndOut) { + d_intermediate[tmp_out_idx] = tmp; + } else { + inter_val += tmp; + } + } i += ELEMWISE_MAX_BLOCK_DIM; } while (i < h); + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; if (BcastY) { if (dy) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; @@ -1206,41 +1283,49 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( } } else { if (dx) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dx[j] = val; } } } + if (!SameShapeOfIntermediateOutAndOut) { + if (d_intermediate) { + inter_val = paddle::platform::reduceSum(inter_val, tid, h); + if (threadIdx.x == 0) { + d_intermediate[j] = inter_val; + } + } + } } -template -static void FusedElemwiseAndActGradBroadcast1CUDA(cudaStream_t stream, - const T *x, const T *y, - const T *intermediate_out, - const T *out, const T *dout, - int h, int w, DX_OP dx_op, - DY_OP dy_op, T *dx, T *dy) { +template +static void FusedElemwiseAndActGradBroadcast1CUDA( + cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, + const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int gird_size = w; FusedElemwiseAndActGradBroadcast1CUDAKernel< - T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY, SameShapeOfIntermediateOutAndOut><<>>( - x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dx, dy); + x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op, + dx, dy, d_intermediate); } -template +template static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( const T *x, const T *y, const T *intermediate_out, const T *out, - const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, T *dx, - T *dy) { + const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { int tid = threadIdx.x; int j = blockIdx.x; - T val(0); + T val(0), inter_val(0); int ttid = tid; int64_t tmp_out_idx, x_idx, y_idx; while (true) { @@ -1259,10 +1344,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( } if (dx != nullptr) { - T tmp = UseIntermediateOut - ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + T tmp = + UseIntermediateOut + ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]); if (BcastY) { dx[x_idx] = tmp; @@ -1271,24 +1358,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( } } if (dy != nullptr) { - T tmp = UseIntermediateOut - ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], - out[offset], dout[offset]) - : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + T tmp = + UseIntermediateOut + ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx], + intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]); if (BcastY) { val += tmp; } else { dy[y_idx] = tmp; } } - + if (d_intermediate != nullptr) { + T tmp = UseIntermediateOut + ? dintermediate_op.UseIntermediateOut( + y[y_idx], intermediate_out[tmp_out_idx], out[offset], + dout[offset]) + : dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset], + dout[offset]); + if (SameShapeOfIntermediateOutAndOut) { + d_intermediate[tmp_out_idx] = tmp; + } else { + inter_val += tmp; + } + } ttid += ELEMWISE_MAX_BLOCK_DIM; } + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; if (BcastY) { if (dy) { - int h = pre * post; - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; @@ -1296,40 +1397,51 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( } } else { if (dx) { - int h = pre * post; - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; val = paddle::platform::reduceSum(val, tid, h); if (threadIdx.x == 0) { dx[j] = val; } } } + if (!SameShapeOfIntermediateOutAndOut) { + if (d_intermediate) { + inter_val = paddle::platform::reduceSum(inter_val, tid, h); + if (threadIdx.x == 0) { + d_intermediate[j] = inter_val; + } + } + } } -template +template static void FusedElemwiseAndActGradBroadcast2CUDA( cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, - DY_OP dy_op, T *dx, T *dy) { + DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, + T *dintermediate) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int gird_size = n; FusedElemwiseAndActGradBroadcast2CUDAKernel< - T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY, SameShapeOfIntermediateOutAndOut><<>>( - x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, dx, dy); + x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, + dintermediate_op, dx, dy, dintermediate); } #endif template void FusedElemwiseAndActGradComputeWithBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::DDim &y_dim_untrimed, const framework::Tensor *x, const framework::Tensor *y, const framework::Tensor *intermediate_out, const framework::Tensor *out, const framework::Tensor *dout, int axis, - framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + framework::Tensor *dx, framework::Tensor *dy, + framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op) { axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); axis = (y_dim.size() == 0) ? x_dim.size() : axis; @@ -1341,70 +1453,82 @@ void FusedElemwiseAndActGradComputeWithBroadcast( int w = n; if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ - FusedElemwiseAndActGradBroadcast1CUDA( ctx.template device_context().stream(), x->data(), y->data(), intermediate_out == nullptr ? nullptr : intermediate_out->data(), - out->data(), dout->data(), h, w, dx_op, dy_op, + out->data(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), + dintermediate == nullptr ? nullptr : dintermediate->mutable_data( + ctx.GetPlace())); #endif } else { - FusedElemwiseAndActGradBroadcast1CPU( x->data(), y->data(), intermediate_out == nullptr ? nullptr : intermediate_out->data(), - out->data(), dout->data(), h, w, dx_op, dy_op, + out->data(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), + dintermediate == nullptr ? nullptr : dintermediate->mutable_data( + ctx.GetPlace())); } } else { if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ - FusedElemwiseAndActGradBroadcast2CUDA( ctx.template device_context().stream(), x->data(), y->data(), intermediate_out == nullptr ? nullptr : intermediate_out->data(), out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), + dintermediate == nullptr ? nullptr : dintermediate->mutable_data( + ctx.GetPlace())); #endif } else { - FusedElemwiseAndActGradBroadcast2CPU( x->data(), y->data(), intermediate_out == nullptr ? nullptr : intermediate_out->data(), out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), + dintermediate == nullptr ? nullptr : dintermediate->mutable_data( + ctx.GetPlace())); } } } template + typename DIntermediate_OP, bool UseIntermediateOut, + bool SameShapeOfIntermediateOutAndOut> void FusedElemwiseAndActGradComputeEx( const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, const framework::Tensor *out, const framework::Tensor *intermediate_out, const framework::Tensor *dout, - int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, - DY_OP dy_op) { + int axis, framework::Tensor *dx, framework::Tensor *dy, + framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op, + DIntermediate_OP dintermediate_op) { const framework::DDim &x_dim = x->dims(); const framework::DDim &y_dim = y->dims(); if (UseIntermediateOut) { PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr"); } if (x_dim == y_dim) { - FusedElemwiseAndActGradComputeNoBroadcast( + FusedElemwiseAndActGradComputeNoBroadcast< + DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut>( ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy, - dx_op, dy_op); + dintermediate, dx_op, dy_op, dintermediate_op); } else { // Y is a scalar bool bcast_y = x_dim.size() >= y_dim.size(); if (x_dim.size() == y_dim.size()) { @@ -1420,16 +1544,16 @@ void FusedElemwiseAndActGradComputeEx( // z = f1(f2(x, y)) if (bcast_y) { // Y should be broadcast. FusedElemwiseAndActGradComputeWithBroadcast< - DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, true /*BcastY*/, - SameShapeOfIntermediateOutAndOut>(ctx, x_dim, y_dim, x, y, - intermediate_out, out, dout, axis, - dx, dy, dx_op, dy_op); + DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, + true /*BcastY*/, SameShapeOfIntermediateOutAndOut>( + ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy, + dintermediate, dx_op, dy_op, dintermediate_op); } else { FusedElemwiseAndActGradComputeWithBroadcast< - DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, false /*BcastY*/, - SameShapeOfIntermediateOutAndOut>(ctx, y_dim, x_dim, x, y, - intermediate_out, out, dout, axis, - dx, dy, dx_op, dy_op); + DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, + false /*BcastY*/, SameShapeOfIntermediateOutAndOut>( + ctx, y_dim, x_dim, x, y, intermediate_out, out, dout, axis, dx, dy, + dintermediate, dx_op, dy_op, dintermediate_op); } } } @@ -1444,7 +1568,7 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, framework::Tensor *intermediate_out) { if (KeepIntermediateOut) { PADDLE_ENFORCE(intermediate_out, - "The keep_intermediate_value is opened, " + "The save_intermediate_out is opened, " "intermediate_out should not be nullptr."); } diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc index 9a297d03cfb041e584159a5fc5ba214f8ac404b4..3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7 100644 --- a/paddle/fluid/operators/extract_rows_op.cc +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto in_rows = in.rows(); + auto &in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc index b54f0091b3fe21222b4690f4dcff1c081d4799e7..d88ef15949da3809bffe41e4bf303d1fee568675 100644 --- a/paddle/fluid/operators/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc @@ -13,18 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused_elemwise_activation_op.h" -#include -#include namespace paddle { namespace operators { -/* - * Whether the compound function is Unary(Binary(X, Y)). - * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final - * out. - */ -static bool IsUnaryCompound(const std::vector &functor_list) { +bool IsUnaryCompound(const std::vector &functor_list) { PADDLE_ENFORCE_EQ(functor_list.size(), 2); static std::unordered_set binary_fun = { "elementwise_add", "elementwise_mul", "elementwise_add_grad", @@ -32,10 +25,17 @@ static bool IsUnaryCompound(const std::vector &functor_list) { return binary_fun.count(functor_list[1]) != 0; } -/* - * Whether the Input(X) could be absent. - */ -static bool InputXCanBeAbsent(const std::vector &functor_list) { +bool HasInPlaceUnary(const std::vector &functor_list) { + PADDLE_ENFORCE_EQ(functor_list.size(), 2); + static std::unordered_set InplaceOpSet = {"relu", "relu_grad"}; + bool is_in_place = false; + for (auto &func_name : functor_list) { + is_in_place |= (InplaceOpSet.count(func_name) == 1); + } + return is_in_place; +} + +bool InputXCanBeAbsent(const std::vector &functor_list) { PADDLE_ENFORCE_EQ(functor_list.size(), 2); static std::unordered_set binary_fun = {"elementwise_add_grad"}; return binary_fun.count(functor_list[0]) != 0 || @@ -86,20 +86,12 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { // Whether the shape of Y is a continuous subsequence of X, // For more information please refer to the op's introduction. - bool bcast_y = x_dim.size() >= y_dim.size(); - if (x_dim.size() == y_dim.size()) { - for (int i = 0; i < x_dim.size(); ++i) { - if (x_dim[i] < y_dim[i]) { - bcast_y = false; - break; - } - } - } + bool bcast_y = IsBcastY(x_dim, y_dim); auto &out_dim = bcast_y ? x_dim : y_dim; std::string out_lod = bcast_y ? "X" : "Y"; - if (ctx->Attrs().Get("keep_intermediate_value")) { + if (ctx->Attrs().Get("save_intermediate_out")) { PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"), "Output(IntermediateOut) of FusedElemwiseActivationOp " "should not be null."); @@ -123,6 +115,20 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { ctx->ShareLoD(out_lod, /*->*/ "Out"); } + static bool IsBcastY(const framework::DDim &x_dim, + const framework::DDim &y_dim) { + bool bcast_y = x_dim.size() >= y_dim.size(); + if (x_dim.size() == y_dim.size()) { + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] < y_dim[i]) { + bcast_y = false; + break; + } + } + } + return bcast_y; + } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -157,17 +163,7 @@ class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker { AddAttr("scale", "scale is used by scale_op, the default value is 0.0.") .SetDefault(0.0); - AddAttr( - "recomputation", - "Whether to recompute the Out." - "The computation of fused_elemwise_activation_grad has two methods to " - "get the dx and dy, one is to use the 'Out', and the other is not. " - "The former method will save the time of recomputing the 'Out', but it " - "must occupy the memory to store the 'out'. While, the later method " - "can avoid occupying the memory, but it must recompute the 'Out'. " - "It is useful for Unary(Binary(X, Y)). The default value is true.") - .SetDefault(true); - AddAttr("keep_intermediate_value", + AddAttr("save_intermediate_out", "Whether to save the intermediate_out.") .SetDefault(false); AddAttr>("functor_list", @@ -227,30 +223,38 @@ class FusedElemwiseActivationGradMaker protected: std::unique_ptr Apply() const override { - auto *op_desc_ptr = new framework::OpDesc(); - op_desc_ptr->SetType(this->ForwardOpType() + "_grad"); + auto *grad_op = new framework::OpDesc(); + grad_op->SetType(this->ForwardOpType() + "_grad"); for (auto &input_param : this->InputNames()) { - op_desc_ptr->SetInput(input_param, this->Input(input_param)); - op_desc_ptr->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param, true)); + grad_op->SetInput(input_param, this->Input(input_param)); + grad_op->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, true)); } - for (auto &output_param : this->OutputNames()) { - op_desc_ptr->SetInput(output_param, this->Output(output_param)); - op_desc_ptr->SetInput(framework::GradVarName(output_param), - this->OutputGrad(output_param)); - } + grad_op->SetInput("Out", this->Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op_desc_ptr->SetAttrMap(this->Attrs()); + grad_op->SetAttrMap(this->Attrs()); std::vector functor_names = - boost::get>( - op_desc_ptr->GetAttr("functor_list")); + boost::get>(grad_op->GetAttr("functor_list")); + functor_names[0] += "_grad"; functor_names[1] += "_grad"; - op_desc_ptr->SetAttr("functor_list", functor_names); - return std::unique_ptr(op_desc_ptr); + grad_op->SetAttr("functor_list", functor_names); + + if (boost::get(grad_op->GetAttr("save_intermediate_out"))) { + PADDLE_ENFORCE_NE(Output("IntermediateOut").size(), 0); + grad_op->SetInput("IntermediateOut", this->Output("IntermediateOut")); + grad_op->SetOutput(framework::GradVarName("IntermediateOut"), + this->OutputGrad("IntermediateOut")); + } else { + grad_op->SetInput("IntermediateOut", {}); + grad_op->SetOutput(framework::GradVarName("IntermediateOut"), {}); + } + + return std::unique_ptr(grad_op); } }; @@ -261,56 +265,65 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@Grad) should not be null"); - if (ctx->Attrs().Get("keep_intermediate_value")) { + + auto functor_list = + ctx->Attrs().Get>("functor_list"); + + if (ctx->Attrs().Get("save_intermediate_out")) { PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"), "Input(IntermediateOut) should not be null"); } else { - PADDLE_ENFORCE_EQ(ctx->Inputs(framework::GradVarName("Out")).size(), 1); + if (!InputXCanBeAbsent(functor_list)) { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + } } - auto funtor_list = - ctx->Attrs().Get>("functor_list"); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); + auto inter_grad_name = framework::GradVarName("IntermediateOut"); if (ctx->HasOutput(x_grad_name)) { if (ctx->HasInputs("X")) { ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); ctx->ShareLoD("X", x_grad_name); } else { - // Node: If "X" is absence, the shape of Y should be a continuous - // subsequence of X, if not, we could not infer the shape of dx. - // Currently, only when Binary is elementwise_add or elementwise_sub, // the "X" could be absent. - PADDLE_ENFORCE(InputXCanBeAbsent(funtor_list), + PADDLE_ENFORCE(InputXCanBeAbsent(functor_list), "Only when BinaryFunctor is elementwise_add, the 'X' " "could be absent."); - // For Unary(Binary(X, Y)), IntermediateOut should not be empty. - if (IsUnaryCompound(funtor_list)) { - PADDLE_ENFORCE( - ctx->HasInputs("IntermediateOut"), - "If the compound_functor is Unary(Binary(X, Y)) and Binary " - "is elementwise_add, the intermediate_out must be not absent."); - } + // Node: If "X" is absence, the shape of Y should be a continuous + // subsequence of X, otherwise, we could not infer the shape of dx. ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(framework::GradVarName("Out"))); ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name); } } + if (ctx->HasOutput(y_grad_name)) { PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y")); ctx->ShareLoD("Y", y_grad_name); } + + if (ctx->HasOutput(inter_grad_name)) { + // For Unary(Binary(X, Y)), IntermediateOut should not be empty. + if (IsUnaryCompound(functor_list)) { + ctx->SetOutputDim(inter_grad_name, + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->ShareLoD(framework::GradVarName("Out"), inter_grad_name); + } else { + ctx->SetOutputDim(inter_grad_name, ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", inter_grad_name); + } + } } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); auto input_data_type_index = ctx.Input("Y")->type(); auto input_data_type = framework::ToDataType(input_data_type_index); return framework::OpKernelType(input_data_type, ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h index 6321541aab7e31cd703289bb8951245215ecb3e2..5ae9aea959c268985c17643f2f47199c852c2bcb 100644 --- a/paddle/fluid/operators/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused_elemwise_activation_op.h @@ -26,6 +26,24 @@ limitations under the License. */ namespace paddle { namespace operators { +/** + * Whether the compound function is Unary(Binary(X, Y)). + * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final + * out. + */ +bool IsUnaryCompound(const std::vector &functor_list); + +/** + * For the in-place unary functor, the inputs of op_desc only have Out and + * Out@Grad. + */ +bool HasInPlaceUnary(const std::vector &functor_list); + +/** + * Whether the Input(X) could be absent. + */ +bool InputXCanBeAbsent(const std::vector &functor_list); + template static void RunBinaryCompoundFunctor( @@ -39,7 +57,7 @@ static void RunBinaryCompoundFunctor( paddle::operators::math::BinaryCompoundFunctor compound_func(binary_functor, unary_functor); int axis = ctx.Attr("axis"); - if (ctx.Attr("keep_intermediate_value")) { + if (ctx.Attr("save_intermediate_out")) { FusedElemwiseAndActComputeEx, @@ -71,7 +89,7 @@ static void RunUnaryCompoundFunctors( paddle::operators::math::UnaryCompoundFunctor compound_func(unary_functor, binary_functor); - if (ctx.Attr("keep_intermediate_value")) { + if (ctx.Attr("save_intermediate_out")) { FusedElemwiseAndActComputeEx, @@ -89,7 +107,7 @@ static void RunUnaryCompoundFunctors( } template + typename UnaryFunctor, typename UnaryGradFunctor, bool InPlace> static void RunBinaryCompoundGradFunctors( const framework::ExecutionContext &ctx, const BinaryGradFunctor &binary_grad_functor, @@ -98,7 +116,7 @@ static void RunBinaryCompoundGradFunctors( const framework::Tensor *in_y, const framework::Tensor *in_out, const framework::Tensor *in_intermediate_out, const framework::Tensor *in_out_grad, framework::Tensor *x_grad, - framework::Tensor *y_grad) { + framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) { // Z = Binary(X, Unary(Y)) int axis = ctx.Attr("axis"); @@ -107,32 +125,40 @@ static void RunBinaryCompoundGradFunctors( UnaryFunctor>; using BinaryCompoundDyFunctor = paddle::operators::math::BinaryCompoundGradDyFunctor< - T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor>; + T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>; + using BinaryCompoundDIntermedaiteOutFunctor = + paddle::operators::math::BinaryCompoundGradDIntermedaiteOutFunctor< + T, BinaryGradFunctor, UnaryFunctor>; if (in_intermediate_out) { FusedElemwiseAndActGradComputeEx< DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, - true /*UseIntermediateOut*/, + BinaryCompoundDIntermedaiteOutFunctor, true /*UseIntermediateOut*/, false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, - y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + y_grad, d_intermediate_out, + BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, - unary_grad_functor)); + unary_grad_functor), + BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor, + unary_functor)); } else { FusedElemwiseAndActGradComputeEx< DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, - false /*UseIntermediateOut*/, + BinaryCompoundDIntermedaiteOutFunctor, false /*UseIntermediateOut*/, false /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, - y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + y_grad, d_intermediate_out, + BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, - unary_grad_functor)); + unary_grad_functor), + BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor, + unary_functor)); } } template + typename BinaryFunctor, typename BinaryGradFunctor, bool InPlace> static void RunUnaryCompoundGradFunctors( const framework::ExecutionContext &ctx, const UnaryGradFunctor &unary_grad_functor, @@ -141,36 +167,44 @@ static void RunUnaryCompoundGradFunctors( const framework::Tensor *in_y, const framework::Tensor *in_out, const framework::Tensor *in_intermediate_out, const framework::Tensor *in_out_grad, framework::Tensor *x_grad, - framework::Tensor *y_grad) { + framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) { // Z = Unary(Binary(X, Y)) int axis = ctx.Attr("axis"); using UnaryCompoundDxFunctor = paddle::operators::math::UnaryCompoundGradDxFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; using UnaryCompoundDyFunctor = paddle::operators::math::UnaryCompoundGradDyFunctor< - T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>; + using UnaryCompoundDIntermediateFunctor = + paddle::operators::math::UnaryCompoundGradDIntermediateFunctor< + T, UnaryGradFunctor, BinaryFunctor, InPlace>; if (in_intermediate_out) { FusedElemwiseAndActGradComputeEx< DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor, - true /*UseIntermediateOut*/, true /*SameShapeOfIntermediateOutAndOut*/>( + UnaryCompoundDIntermediateFunctor, true /*UseIntermediateOut*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, - y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, - binary_grad_functor), + y_grad, d_intermediate_out, + UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, - binary_grad_functor)); + binary_grad_functor), + UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor)); } else { - FusedElemwiseAndActGradComputeEx( + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor, + UnaryCompoundDIntermediateFunctor, false /*UseIntermediateOut*/, + true /*SameShapeOfIntermediateOutAndOut*/>( ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, - y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, - binary_grad_functor), + y_grad, d_intermediate_out, + UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, - binary_grad_functor)); + binary_grad_functor), + UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor)); } } @@ -226,72 +260,67 @@ static void RunFunctors(const framework::ExecutionContext &ctx, } } -template -static void RunGradFunctors(const framework::ExecutionContext &ctx, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - const framework::Tensor *in_out, - const framework::Tensor *in_intermediate_out, - const framework::Tensor *in_out_grad, - framework::Tensor *x_grad, - framework::Tensor *y_grad) { +template +static void RunGradFunctors( + const framework::ExecutionContext &ctx, const framework::Tensor *in_x, + const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, + const framework::Tensor *in_out_grad, framework::Tensor *x_grad, + framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) { auto &functors = ctx.Attr>("functor_list"); auto funcs_str = functors[0] + "," + functors[1]; - // TODO(zcd): The following code can be refined. for example, use registrition if (funcs_str == "elementwise_add_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor>( + RunBinaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::AddGradFunctor, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor, InPlace>( ctx, paddle::operators::math::AddGradFunctor(), paddle::operators::math::ScaleFunctor(scale), paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad); + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "scale_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); - RunUnaryCompoundGradFunctors, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, - ReComputation /*Recomputation*/>( + RunUnaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::ScaleGradFunctor, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, InPlace>( ctx, paddle::operators::math::ScaleGradFunctor(scale), paddle::operators::math::AddFunctor(), paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad); + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_add_grad,relu_grad") { - RunBinaryCompoundGradFunctors, - paddle::operators::math::ReluFunctor, - paddle::operators::math::ReluGradFunctor>( + RunBinaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::AddGradFunctor, + paddle::operators::math::ReluFunctor, + paddle::operators::math::ReluGradFunctor, InPlace>( ctx, paddle::operators::math::AddGradFunctor(), paddle::operators::math::ReluFunctor(), paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad); + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "relu_grad,elementwise_add_grad") { - RunUnaryCompoundGradFunctors, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, - ReComputation /*Recomputation*/>( + RunUnaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::ReluGradFunctor, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, InPlace>( ctx, paddle::operators::math::ReluGradFunctor(), paddle::operators::math::AddFunctor(), paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad); + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_mul_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundGradFunctors, - paddle::operators::math::ScaleFunctor, - paddle::operators::math::ScaleGradFunctor>( + RunBinaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::MulGradFunctor, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor, InPlace>( ctx, paddle::operators::math::MulGradFunctor(), paddle::operators::math::ScaleFunctor(scale), paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad); + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { PADDLE_THROW("%s has not been implemented.", funcs_str); } @@ -313,9 +342,9 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { std::vector outputs; outputs.emplace_back(output); - if (ctx.Attr("keep_intermediate_value")) { + if (ctx.Attr("save_intermediate_out")) { PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), - "The keep_intermediate_value is enable, so the " + "The save_intermediate_out is enable, so the " "IntermediateOut should not be empty."); auto intermediate_out = ctx.Output("IntermediateOut"); outputs.emplace_back(intermediate_out); @@ -331,65 +360,63 @@ template class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.Input("X"); - auto y = ctx.Input("Y"); - + auto in_y = ctx.Input("Y"); + PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr."); auto in_out = ctx.Input("Out"); + PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr."); auto in_out_grad = ctx.Input(framework::GradVarName("Out")); - + PADDLE_ENFORCE(in_out_grad != nullptr, + "Input(Out@Grad) should not be nullptr."); + framework::Tensor *in_x = + const_cast(ctx.Input("X")); framework::Tensor *x_grad = ctx.Output(framework::GradVarName("X")); framework::Tensor *y_grad = ctx.Output(framework::GradVarName("Y")); + framework::Tensor *d_intermediate_out = ctx.Output( + framework::GradVarName("IntermediateOut")); - PADDLE_ENFORCE(y != nullptr, "Input(Y) should not be nullptr."); - - if (ctx.Attr("recomputation")) { - PADDLE_ENFORCE( - x != nullptr, - "The recomputation is opened, so Input(X) should not be absent."); - } else { - PADDLE_ENFORCE(in_out != nullptr, - "The recomputation is disabled, so the Input('Out') " - "should not be empty."); - } - - framework::Tensor *in_x; auto functor_list = ctx.Attr>("functor_list"); - // If functor_list contains elementwise_add, the backward doesn't use - // in_x, and in_outs. - if (x == nullptr) { - PADDLE_ENFORCE(functor_list[0] == "elementwise_add_grad" || - functor_list[1] == "elementwise_add_grad", - "Only when the compoundfunctor contains " - "elementwise_add_grad, the 'X' could be absent."); - in_x = const_cast(in_out_grad); - in_out = const_cast(in_out_grad); - } else { - in_x = const_cast(x); - } - - framework::Tensor *in_intermediate_out; - if (ctx.Attr("keep_intermediate_value")) { + // Get intermediate_out + framework::Tensor *in_intermediate_out = nullptr; + if (ctx.Attr("save_intermediate_out")) { + // if save_intermediate_out is true, for Unary(Binary(x, y)) and + // Binary(x, Unary(y)), the Binary(x, y) and Unary(y) not need to + // recompute. in_intermediate_out = const_cast( ctx.Input("IntermediateOut")); PADDLE_ENFORCE(in_intermediate_out != nullptr, - "The option of 'keep_intermediate_value' is opened, " + "The option of 'save_intermediate_out' is opened, " "so the number of 'Out' should be two."); } else { - in_intermediate_out = nullptr; + if (!InputXCanBeAbsent(functor_list)) { + PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null."); + } + } + + // Get in_x + if (ctx.HasInput("X")) { + PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr."); + } else { + // If functor_list contains elementwise_add, the backward doesn't use + // in_x, in_y and in_out. + PADDLE_ENFORCE(InputXCanBeAbsent(functor_list), + "Only when the compoundfunctor contains " + "elementwise_add_grad, the 'X' could be absent."); + in_x = const_cast(in_out_grad); } - if (ctx.Attr("recomputation")) { - RunGradFunctors( - ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, - y_grad); + bool has_in_place = HasInPlaceUnary(functor_list); + if (has_in_place) { + RunGradFunctors( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad, d_intermediate_out); } else { - RunGradFunctors( - ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, - y_grad); + RunGradFunctors( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad, d_intermediate_out); } } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index b3f7e0c0097b469998049a1db65d56a28cf02b5e..8eab83fcd247fcd099ae1fa5dab1e67c2081bf9c 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/port.h" @@ -26,6 +29,61 @@ struct CopyRange { size_t end; }; +struct LoDTensorToArrayFunctor; + +template +struct LoDTensorToArrayFunctorImpl { + const LoDTensorToArrayFunctor *prev_functor_; + DeviceContext *dev_ctx_; + template + void apply(); +}; + +struct LoDTensorToArrayFunctor : public boost::static_visitor { + std::vector ref_inputs_; + mutable std::vector outputs_; + const framework::Tensor &input_; + + explicit LoDTensorToArrayFunctor(const framework::Tensor &input) + : input_(input) {} + + void AddOutput(framework::Tensor *t) { + outputs_.emplace_back(t); + ref_inputs_.emplace_back(t); + } + + template + void operator()(Place place) const { + auto &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(place); + if (std::is_same::value) { + Apply(static_cast(dev_ctx)); + } else { +#ifdef PADDLE_WITH_CUDA + Apply(static_cast(dev_ctx)); +#else + PADDLE_THROW("Not compiled with cuda"); +#endif + } + } + + template + void Apply(DeviceContext *dev_ctx) const { + LoDTensorToArrayFunctorImpl func; + func.prev_functor_ = this; + func.dev_ctx_ = dev_ctx; + framework::VisitDataType(framework::ToDataType(input_.type()), func); + } +}; + +template +template +void LoDTensorToArrayFunctorImpl::apply() { + math::ConcatGradFunctor func; + func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0, + &prev_functor_->outputs_); +} + class LoDTensorToArrayOp : public framework::OperatorBase { public: LoDTensorToArrayOp(const std::string &type, @@ -72,6 +130,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase { copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); } } + + auto &outputs = *const_cast(scope) + .Var() + ->GetMutable>(); + for (size_t i = 0; i < max_seq_len; ++i) { auto &ranges = copy_ranges[i]; size_t height = std::accumulate( @@ -90,17 +153,16 @@ class LoDTensorToArrayOp : public framework::OperatorBase { // out[i][offset: offset+len] = x[each_range.begin: each_range.end] auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::TensorCopy(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx, &slice); + outputs.insert({each_range.begin, slice}); offset += len; } } + + LoDTensorToArrayFunctor functor(x); + for (auto &out_pair : outputs) { + functor.AddOutput(&out_pair.second); + } + platform::VisitPlace(place, functor); } }; diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/fluid/operators/math/compound_functors.h index 1d32a9585b08a9d27730076d9f7baa6056270a42..7aba4a917cdea50f95bcc7627f707257606fc927 100644 --- a/paddle/fluid/operators/math/compound_functors.h +++ b/paddle/fluid/operators/math/compound_functors.h @@ -22,11 +22,11 @@ namespace paddle { namespace operators { namespace math { +// Z = BinaryFunctor(X, UnaryFunctor(Y)) template struct BinaryCompoundFunctor { BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2) : func1_(func1), func2_(func2) {} - // Z = BinaryFunctor(X, UnaryFunctor(Y)) inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); } @@ -40,11 +40,11 @@ struct BinaryCompoundFunctor { UnaryFunctor func2_; }; +// Z = UnaryFunctor(BinaryFunctor(X, Y)) template struct UnaryCompoundFunctor { UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2) : func1_(func1), func2_(func2) {} - // Z = UnaryFunctor(BinaryFunctor(X, Y)) inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); } @@ -58,23 +58,19 @@ struct UnaryCompoundFunctor { BinaryFunctor func2_; }; -// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get -// the dx, one is to use the 'out', and the other is not to use it. -// the former method will save the time of recomputing the -// 'out', but it must occupy the memory to store the 'out'. -// While the later method can avoid occupying this memory, -// but it must recompute the 'out'. +// Z = BinaryFunctor(X, UnaryFunctor(Y)) template struct BinaryCompoundGradDxFunctor { BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun, const UnaryFun &unary_fun) : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { return dout * d_binary_fun_.Dx(x, unary_fun_(y)); } - inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, + T dout) { return dout * d_binary_fun_.Dx(x, intermediate_out); } @@ -83,8 +79,9 @@ struct BinaryCompoundGradDxFunctor { UnaryFun unary_fun_; }; +// Z = BinaryFunctor(X, UnaryFunctor(Y)) template + typename DUnaryFun, bool InPlace> struct BinaryCompoundGradDyFunctor { BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, const UnaryFun &unary_fun, @@ -93,13 +90,19 @@ struct BinaryCompoundGradDyFunctor { unary_fun_(unary_fun), d_unary_fun_(d_unary_fun) {} - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { - return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_(y); + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y); } - inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { - return dout * d_binary_fun_.Dy(x, intermediate_out) * - d_unary_fun_(y, intermediate_out); + inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, + T dout) { + if (InPlace) { + return dout * d_binary_fun_.Dy(x, intermediate_out) * + d_unary_fun_.UseOut(intermediate_out); + } else { + return dout * d_binary_fun_.Dy(x, intermediate_out) * + d_unary_fun_.UseXAndOut(y, intermediate_out); + } } private: @@ -108,8 +111,9 @@ struct BinaryCompoundGradDyFunctor { DUnaryFun d_unary_fun_; }; +// Z = UnaryFunctor(BinaryFunctor(X, Y)) template + typename DBinaryFun, bool InPlace> struct UnaryCompoundGradDxFunctor { UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -118,22 +122,23 @@ struct UnaryCompoundGradDxFunctor { binary_fun_(binary_fun), d_binary_fun_(d_binary_fun) {} - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { T base; - if (Recomputation) { - base = dout * d_unary_fun_(binary_fun_(x, y)); + if (InPlace) { + base = dout * d_unary_fun_.UseOut(out); } else { - base = dout * d_unary_fun_(binary_fun_(x, y), out); + base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out); } return base * d_binary_fun_.Dx(x, y); } - inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, + T dout) { T base; - if (Recomputation) { - base = dout * d_unary_fun_(intermediate_out); + if (InPlace) { + base = dout * d_unary_fun_.UseOut(out); } else { - base = dout * d_unary_fun_(intermediate_out, out); + base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out); } return base * d_binary_fun_.Dx(x, y); } @@ -144,8 +149,9 @@ struct UnaryCompoundGradDxFunctor { DBinaryFun d_binary_fun_; }; +// Z = UnaryFunctor(BinaryFunctor(X, Y)) template + typename DBinaryFun, bool InPlace> struct UnaryCompoundGradDyFunctor { UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun, const BinaryFun &binary_fun, @@ -154,22 +160,23 @@ struct UnaryCompoundGradDyFunctor { binary_fun_(binary_fun), d_binary_fun_(d_binary_fun) {} - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { T base; - if (Recomputation) { - base = dout * d_unary_fun_(binary_fun_(x, y)); + if (InPlace) { + base = dout * d_unary_fun_.UseOut(out); } else { - base = dout * d_unary_fun_(binary_fun_(x, y), out); + base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out); } return base * d_binary_fun_.Dy(x, y); } - inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out, + T dout) { T base; - if (Recomputation) { - base = dout * d_unary_fun_(intermediate_out); + if (InPlace) { + base = dout * d_unary_fun_.UseOut(out); } else { - base = dout * d_unary_fun_(intermediate_out, out); + base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out); } return base * d_binary_fun_.Dy(x, y); } @@ -180,6 +187,56 @@ struct UnaryCompoundGradDyFunctor { DBinaryFun d_binary_fun_; }; +// Z = BinaryFunctor(X, UnaryFunctor(Y)) +template +struct BinaryCompoundGradDIntermedaiteOutFunctor { + BinaryCompoundGradDIntermedaiteOutFunctor(const DBinaryFun &d_binary_fun, + const UnaryFun &unary_fun) + : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} + + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dy(x, unary_fun_(y)); + } + + inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + T dout) { + return dout * d_binary_fun_.Dy(x, intermediate_out); + } + + private: + DBinaryFun d_binary_fun_; + UnaryFun unary_fun_; +}; + +// Z = UnaryFunctor(BinaryFunctor(X, Y)) +template +struct UnaryCompoundGradDIntermediateFunctor { + UnaryCompoundGradDIntermediateFunctor(const DUnaryFun &d_unary_fun, + const BinaryFun &binary_fun) + : d_unary_fun_(d_unary_fun), binary_fun_(binary_fun) {} + + inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) { + if (InPlace) { + return dout * d_unary_fun_.UseOut(out); + } else { + return dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out); + } + } + + inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out, + T dout) { + if (InPlace) { + return dout * d_unary_fun_.UseOut(out); + } else { + return dout * d_unary_fun_.UseXAndOut(intermediate_out, out); + } + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index c3c5c160db358d39aa3f841a2b1646a21c91440e..7b79f10e33d4474e279c6e46208722d6b52277fc 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -27,7 +27,7 @@ template class ConcatFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const std::vector& input, const int axis, + const std::vector& input, int axis, framework::Tensor* output) { // TODO(zcd): Add input data validity checking int num = input.size(); @@ -71,7 +71,7 @@ class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, + const std::vector& ref_inputs, const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking size_t num = outputs->size(); @@ -109,16 +109,11 @@ class ConcatGradFunctor { } } }; +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class ConcatGradFunctor; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; - -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; +FOR_ALL_TYPES(DEFINE_FUNCTOR); } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 342379268be36cc5b532363e664f6e73990333e1..b59d86e661aff25eba8e770247e85845365d628b 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -118,7 +119,7 @@ template class ConcatFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const std::vector& input, const int axis, + const std::vector& input, int axis, framework::Tensor* output) { // TODO(zcd): Add input data validity checking int in_num = input.size(); @@ -192,8 +193,8 @@ class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, - const int axis, std::vector* outputs) { + const std::vector& ref_inputs, + int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); int out_row = 1; @@ -261,15 +262,11 @@ class ConcatGradFunctor { } }; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class ConcatGradFunctor -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; +FOR_ALL_TYPES(DEFINE_FUNCTOR); } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index e5d7d860b371677b3cfc67a57390bdee0d0ecc37..867a84fa873a2e90bdab7a5eecbb1755cb4b02d1 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -37,7 +37,7 @@ template class ConcatFunctor { public: void operator()(const DeviceContext& context, - const std::vector& input, const int axis, + const std::vector& input, int axis, framework::Tensor* output); }; @@ -57,10 +57,21 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ref_inputs, - const int axis, std::vector* outputs); + const std::vector& ref_inputs, + int axis, std::vector* outputs); }; } // namespace math } // namespace operators } // namespace paddle + +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16) diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h index b6f4ab93777f2b5eb13a4a3172028d87c4546017..47c771f7c5c01b651423c7886207abf4a4297019 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h @@ -85,26 +85,59 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, T *prev_output_value, int frame_size, ActivationType active_gate) { #ifdef __AVX__ - __m256 r_value_update_gate; - __m256 r_value_reset_gate; + __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); + __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f); __m256 r_value_reset_output; - __m256 r_prev_out = _mm256_set1_ps(0.0f); - __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); - __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size); + __m256 r_prev_out = _mm256_set1_ps(0.0f), + r_prev_out_last = _mm256_set1_ps(0.0f); + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + int block = 8; + const int n = frame_size; + const int rest = n % block; + const int end = n - rest; + int i = 0; + + if (rest > 0) { + i = n - block; + r_value_update_gate_last = + _mm256_loadu_ps((const float *)(update_gate + i)); + r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i)); + if (prev_output_value) { + r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); + } + } - for (int i = 0; i < frame_size / 8; i++) { - r_value_update_gate = update_gate[i]; - r_value_reset_gate = reset_gate[i]; + for (i = 0; i < end; i += block) { + r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); + r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i)); if (prev_output_value) { - r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; + r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); } op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, &r_value_reset_output, active_gate); - update_gate[i] = r_value_update_gate; - reset_gate[i] = r_value_reset_gate; - (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output; + _mm256_storeu_ps(reinterpret_cast(update_gate + i), + r_value_update_gate); + _mm256_storeu_ps(reinterpret_cast(reset_gate + i), + r_value_reset_gate); + _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), + r_value_reset_output); + } + + if (rest > 0) { + i = n - block; + + op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last, + &r_prev_out_last, &r_value_reset_output, active_gate); + + _mm256_storeu_ps(reinterpret_cast(update_gate + i), + r_value_update_gate_last); + _mm256_storeu_ps(reinterpret_cast(reset_gate + i), + r_value_reset_gate_last); + _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), + r_value_reset_output); } #endif } @@ -115,26 +148,55 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, T *output_value, int frame_size, ActivationType active_node) { #ifdef __AVX__ - __m256 r_value_update_gate; - __m256 r_value_frame_state; - __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); + __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f); + __m256 r_prev_out = _mm256_set1_ps(0.0f), + r_prev_out_last = _mm256_set1_ps(0.0f); __m256 r_output; - __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); - __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2); + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + int block = 8; + const int n = frame_size; + const int rest = n % block; + const int end = n - rest; + int i = 0; + + if (rest > 0) { + i = n - block; + r_value_update_gate_last = + _mm256_loadu_ps((const float *)(update_gate + i)); + r_value_frame_state_last = + _mm256_loadu_ps((const float *)(frame_state + i)); + if (prev_output_value) { + r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); + } + } - for (int i = 0; i < frame_size / 8; i++) { - r_value_update_gate = update_gate[i]; - r_value_frame_state = frame_state[i]; + for (i = 0; i < end; i += block) { + r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); + r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i)); if (prev_output_value) { - r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; + r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); } op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, &r_output, active_node); - frame_state[i] = r_value_frame_state; - (reinterpret_cast<__m256 *>(output_value))[i] = r_output; + _mm256_storeu_ps(reinterpret_cast(frame_state + i), + r_value_frame_state); + _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); + } + + if (rest > 0) { + i = n - block; + op_final_output(&r_value_update_gate_last, &r_value_frame_state_last, + &r_prev_out_last, &r_output, active_node); + + _mm256_storeu_ps(reinterpret_cast(frame_state + i), + r_value_frame_state_last); + _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); } + #endif } @@ -143,7 +205,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output, GRUMetaValue value, int frame_size, int batch_size, ActivationType active_gate) { for (int b = 0; b < batch_size; b++) { - if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && + (sizeof(T) == 4)) { hl_avx_gru_forward_reset_output( op_reset_output, value.gate_value, value.reset_output_value, value.prev_out_value, frame_size, active_gate); @@ -166,7 +229,8 @@ inline void forward_final_output(OpFinalOutput op_final_output, GRUMetaValue value, int frame_size, int batch_size, ActivationType active_node) { for (int b = 0; b < batch_size; b++) { - if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && + (sizeof(T) == 4)) { hl_avx_gru_forward_final_output(op_final_output, value.gate_value, value.prev_out_value, value.output_value, frame_size, active_node); diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h index ddb01cdfc084f5ba2e9e573be461389f46fbe03f..955c0b6bad5f81e64e2e6d1f7521abec49d57721 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/fluid/operators/math/functors.h @@ -58,9 +58,9 @@ template struct ScaleGradFunctor { explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {} - inline HOSTDEVICE T operator()(T x) { return coeff_; } - - inline HOSTDEVICE T operator()(T x, T out) { return coeff_; } + inline HOSTDEVICE T UseX(T x) { return coeff_; } + inline HOSTDEVICE T UseOut(T out) { return coeff_; } + inline HOSTDEVICE T UseXAndOut(T x, T out) { return coeff_; } private: T coeff_; @@ -73,9 +73,9 @@ struct ReluFunctor { template struct ReluGradFunctor { - inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; } - - inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; } + inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; } + inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; } + inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; } }; } // namespace math diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f0f723dd120155648a9046b628bed2ecb078e040..ae51a53a7197950338ef773d63103fa13bf0a5f5 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,11 +60,9 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -148,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 97c2e69fe5327956fc2828781fe3a37b88cc1b99..a92aef805a0434f2ebcbc62d4e5eaef0cfb21bfa 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -135,7 +135,7 @@ class SequencePoolFunctor { const std::string pooltype, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index = nullptr) { - auto lod = input.lod()[0]; + auto& lod = input.lod()[0]; const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); dim3 grid(lod.size(), 1); @@ -297,7 +297,7 @@ class SequencePoolGradFunctor { framework::LoDTensor* in_grad, /* max pool has index */ const framework::Tensor* index = nullptr) { - auto lod = in_grad->lod()[0]; + auto& lod = in_grad->lod()[0]; const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); dim3 grid(lod.size(), 1); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 7182149164854038bb67a9f06cdbec8a4a0f1fb2..242a1b9ae92ade0caf1b0f1fcb5458b8b7070d84 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -59,7 +59,8 @@ class MatMulKernel : public framework::OpKernel { RowMatrixFromVector(x.dims()), 0, context.Attr("transpose_X")); auto mat_dim_b = math::CreateMatrixDescriptor( ColumnMatrixFromVector(y.dims()), 0, context.Attr("transpose_Y")); - blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0)); + auto scale = static_cast(context.Attr("alpha")); + blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); } }; @@ -185,7 +186,8 @@ class MatMulGradKernel : public framework::OpKernel { auto blas = math::GetBlas(context); auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); - blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0)); + blas.MatMul(a, mat_dim_a, b, mat_dim_b, + static_cast(context.Attr("alpha")), out, T(0)); } void CalcInputGrad(const framework::ExecutionContext &context, @@ -334,6 +336,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { R"DOC(If true, use the transpose of `Y`. )DOC") .SetDefault(false); + AddAttr("alpha", "The scale of Out").SetDefault(1.0f); AddComment(R"DOC( MatMul Operator. diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 2a8e4af516ce9341772d4668dc993215b4aae24d..363abfb0e0c96e8a4d82124dff168f28e339a9ae 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -156,12 +156,29 @@ class MulGradOp : public framework::OperatorWithKernel { } }; +class MulOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr retv(new framework::OpDesc()); + retv->SetType("mul_grad"); + retv->SetInput("X", Input("X")); + retv->SetInput("Y", Input("Y")); + retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), InputGrad("X")); + retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + retv->SetAttrMap(Attrs()); + return retv; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker); REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( mul, ops::MulKernel, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index c614de2eac143b3a545c60226aefa93dd72dea4f..13be6c65be58314a75124106eb09b1300305baf0 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -52,6 +52,12 @@ $$Out = scale*X$$ )DOC"); AddAttr("scale", "The scaling factor of the scale operator.") .SetDefault(1.0); + AddAttr("bias", "The bias of the scale operator.").SetDefault(0.0); + AddAttr( + "bias_after_scale", + "Apply bias addition after or before scaling. It is useful for " + "numeric stability in some circumstances.") + .SetDefault(true); } }; @@ -80,6 +86,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); + grad_op->SetAttr("bias", 0.0f); + grad_op->SetAttr("bias_after_scale", true); return std::unique_ptr(grad_op); } }; diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index fe035aba81dd74d21539974beed255275be3013b..d8a199bc2b860515645b4954b49d8eb59fbd02dc 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -34,6 +34,8 @@ class ScaleKernel : public framework::OpKernel { "in and out should have the same dim"); auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); if (in_var->IsType() && in_var != out_var) { auto& in_slr = in_var->Get(); @@ -45,7 +47,11 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); auto& dev = *ctx.template device_context().eigen_device(); - eigen_out.device(dev) = scale * eigen_in; + if (bias_after_scale) { + eigen_out.device(dev) = scale * eigen_in + bias; + } else { + eigen_out.device(dev) = scale * (eigen_in + bias); + } } }; diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 077b9a5f7d935a39706ef3c2b710522bf1b713ed..397a3182953e3f1afaeadeff6d53a4f22fb95d26 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -1,136 +1,100 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/sequence_concat_op.h" +#include namespace paddle { namespace operators { -class SequenceConcatOp : public framework::OperatorWithKernel { +class SeqConcatOpMaker : public framework::OpProtoAndCheckerMaker { public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs("X"), - "Inputs(X) of SequenceConcatOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequenceConcatOp should not be null."); - const size_t level = static_cast(ctx->Attrs().Get("level")); - const size_t axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE(level == 0UL || level == 1UL, - "The sequence_concat operator only accepts sequence " - "or a nested sequence as its input."); - auto ins_dims = ctx->GetInputsDim("X"); - framework::DDim out_dims = ins_dims[0]; - const size_t n = ins_dims.size(); - for (size_t i = 1; i < n; ++i) { - out_dims[axis] += ins_dims[i][axis]; - } - ctx->SetOutputDim("Out", out_dims); + void Make() override { + AddInput("X", "The inputs of sequence concat op").AsDuplicable(); + AddOutput("Out", "The output of sequence concat op"); + AddComment( + "Sequence Concat Op\n" + "It will concat LoD tensors by its sequence information.\n" + "For example:\n" + " LoD of X1 = [0, 3, 7]\n" + " LoD of X2 = [0, 7, 9]\n" + " Result LoD is [0, (3+7), (7+9)]\n" + " i.e.[0, 10, 16]\n"); } }; -class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { +class SeqConcatShapeInferer : public framework::InferShapeBase { public: - void Make() override { - AddInput("X", - "(LodTensorArray) Input is a vector of LoDTensor, " - "each of which is a variable-length sequence or nested sequence.") - .AsDuplicable(); - AddOutput("Out", - "(LoDTensor), Variable-length output of " - "sequence_concat Op."); - AddAttr("axis", - "(int, default 0) " - "The axis along which the inputs will be joined. " - "If axis is 0, the inputs will be joined with LoD index.") - .SetDefault(0); - AddAttr("level", - "(int, default 0) " - "The level at which the inputs will be joined. " - "If the level is 0, the inputs will be joined at the nested " - "sequence level. " - "If the level is 1, the inputs will be joined at the " - "sequence level. " - "The level should be less than the level number of inputs.") - .SetDefault(0); - AddComment(R"DOC( -The sequence_concat operator concatenates multiple LoDTensors. -It only supports sequence (LoD Tensor with level number is 1) -or a nested sequence (LoD tensor with level number is 2) as its input. -- Case1: - If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD - information of the output keeps the same as the input. - - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4) - LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - -- Case2: - If the axis is 0(here, leve is 0), the inputs are concatenated along - time steps, the LoD information of the output need to re-compute. - The LoD information of level-1 should be same. - - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) - LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4) - -- Case3: - If the axis is 0(here, level is 1). - - LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4) - LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4) - -- Case4: - If the LoD number is 1, axis is 0, level is 0 - - LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4) - LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4) - LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4) - -NOTE: The levels of all the inputs should be the same. - )DOC"); + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("X"), + "Input(X) of Sequence Concat Op should not be null."); + PADDLE_ENFORCE(context->HasOutput("Out"), + "Output(Out) of Sequence Concat Op should not be null."); + + PADDLE_ENFORCE_GT(context->Inputs("X").size(), 1, + "The number of input sequences is at least two."); + auto x_dims = context->GetInputsDim("X"); + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (auto &x_dim : x_dims) { + if (out_dims.empty()) { + out_dims = framework::vectorize(x_dim); + } + batch_size += x_dim[0]; + if (feature_size == 0) { + feature_size = framework::product(x_dim) / x_dim[0]; + } else { + PADDLE_ENFORCE_EQ( + feature_size, framework::product(x_dim) / x_dim[0], + "Inputs of sequence concat must have same feature size"); + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. + } + out_dims[0] = batch_size; + context->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (!context->IsRuntime()) { // Runtime LoD infershape will be computed + // in Kernel. + context->ShareLoD("X", "Out"); + } } }; -class SequenceConcatGradOp : public framework::OperatorWithKernel { +class SeqConcatGradShapeInferer : public framework::InferShapeBase { public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "The gradient of Out should not be null."); - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), - "The gradient of X should not be null."); - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputsDim(framework::GradVarName("X"), + context->GetInputsDim("X")); } }; - } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp, - ops::SequenceConcatOpMaker, - paddle::framework::DefaultGradOpDescMaker< - false> /* set false to disable empty grad */); -REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp); -REGISTER_OP_CPU_KERNEL( - sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_CPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); +namespace op = paddle::operators; + +REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, + op::SeqConcatOpMaker, op::SeqConcatShapeInferer, + paddle::framework::DefaultGradOpDescMaker); +template +using Kernel = op::SeqConcatKernel; +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, + op::SeqConcatGradShapeInferer); +template +using GradKernel = + op::SeqConcatGradKernel; +REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, + GradKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_concat_op.cu.cc index 43860b7c51712bf481d54c4f157ee608678c6940..eb6535235df80a9267b22403ae1f35c6cefb7fe7 100644 --- a/paddle/fluid/operators/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_concat_op.cu.cc @@ -1,23 +1,26 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/sequence_concat_op.h" -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, - ops::SequenceConcatGradOpKernel< - paddle::platform::CUDADeviceContext, float>); +template +using Kernel = + paddle::operators::SeqConcatKernel; +REGISTER_OP_CUDA_KERNEL(sequence_concat, Kernel, Kernel); +template +using GradKernel = + paddle::operators::SeqConcatGradKernel; +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, GradKernel, + GradKernel); diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h index 71c9f45287c29628a2f2c8c649e9e5270317ef6a..33e9babff274af888b84d33c991cc0a5b70333ae 100644 --- a/paddle/fluid/operators/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_concat_op.h @@ -1,171 +1,130 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once + #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/concat.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using LoD = framework::LoD; - -template -LoD ConcatLoD(const std::vector ins, const size_t level) { - auto out_lod = ins[0]->lod(); - auto numLevels = ins[0]->NumLevels(); - const size_t n = ins.size(); - const size_t level_idx = ins[0]->NumLevels() - 1 - level; - for (size_t i = 1; i < n; ++i) { - for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) { - out_lod[level_idx][j] += ins[i]->lod()[level_idx][j]; +namespace detail { +template +inline framework::LoD ConcatLoD(const Container &xs, + std::vector *xs_in_order) { + std::vector result; + result.resize(xs[0].get().lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto &x_lod = xs[j].get().lod()[0]; + const framework::Tensor &tensor = xs[j].get(); + xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i])); + sum += x_lod[i]; } + result[i] = sum; } - - for (size_t i = level_idx; i < numLevels - 1; ++i) { - size_t lod_len = 1; - for (size_t j = 0; j < n; ++j) { - lod_len += ins[j]->lod()[i + 1].size() - 1; - } - out_lod[i + 1].clear(); - out_lod[i + 1].resize(lod_len); - - size_t idx = 1; - for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) { - for (size_t k = 0; k < n; ++k) { - for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) { - out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] + - ins[k]->lod()[i + 1][m + 1] - - ins[k]->lod()[i + 1][m]; - idx++; - } - } - } - } - - return out_lod; + framework::LoD lod; + lod.emplace_back(result); + return lod; } +} // namespace detail template -class SequenceConcatOpKernel : public framework::OpKernel { +class SeqConcatKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - const size_t axis = static_cast(ctx.Attr("axis")); - const size_t level = static_cast(ctx.Attr("level")); - const size_t n = ins.size(); - - for (size_t i = 1; i < n; ++i) { - PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(), - "The levels of all the input LoDTensors " - "should be the same."); - PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(), - "The dimension size of all the input LoDTensors " - "should be the same."); - - const size_t dims_size = ins[i]->dims().size(); - for (size_t j = 0; j < dims_size; ++j) { - if (j == axis) continue; - PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j], - "Except for the dimension of the specified " - "axis along which all the inputs are concatenated, " - "dimensions of all the other axises of the input " - "LoDTensors should be the same."); - } - } - PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level, - "The levels of all the input LoDTensors " - "should be greater than the specify level"); - - out->mutable_data(ctx.GetPlace()); - auto out_lod = ins[0]->lod(); - if (axis == 0) { - out_lod = ConcatLoD(ins, level); - } - out->set_lod(out_lod); - - const size_t level_idx = out_lod.size() - level - 1; - auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; - for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { - Tensor out_t = out->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); - auto out_stride = framework::stride(out_t.dims()); - size_t offset = 0; - for (size_t j = 0; j < n; ++j) { - auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx]; - auto in_stride = framework::stride(ins[j]->dims()); - Tensor in_t = ins[j]->Slice(static_cast(in_lod_level[i]), - static_cast(in_lod_level[i + 1])); - size_t axis_dim = in_t.dims()[axis]; - StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, - in_t.dims(), out_stride, out_t.data() + offset); - offset += axis_dim * in_stride[axis]; + void Compute(const framework::ExecutionContext &context) const override { + auto xs = detail::VectorRef(context.MultiInput("X"), + "Cannot find multiple input X"); + auto &out = detail::Ref(context.Output("Out"), + "Cannot find output"); + + size_t lod_size = 0; + for (auto &x : xs) { + if (lod_size == 0) { + lod_size = x.get().lod()[0].size(); + } else { + PADDLE_ENFORCE_EQ( + lod_size, x.get().lod()[0].size(), + "The number of sequence must be same between each input"); } } + PADDLE_ENFORCE_NE(lod_size, 0, "Each input must have sequence information"); + + std::vector x_in_order; + out.set_lod(detail::ConcatLoD(xs, &x_in_order)); + out.mutable_data(context.GetPlace()); + math::ConcatFunctor functor; + functor(context.template device_context(), x_in_order, 0, + &out); } }; template -class SequenceConcatGradOpKernel : public framework::OpKernel { +class SeqConcatGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto x_grads = - ctx.MultiOutput(framework::GradVarName("X")); - size_t axis = static_cast(ctx.Attr("axis")); - size_t level = static_cast(ctx.Attr("level")); - const size_t n = x_grads.size(); - - // Set Grad(X) LoD as X - for (size_t i = 0; i < n; i++) { - x_grads[i]->set_lod(ins[i]->lod()); - x_grads[i]->mutable_data(ctx.GetPlace()); + void Compute(const framework::ExecutionContext &context) const override { + auto xs = context.MultiInput("X"); + auto dxs = + context.MultiOutput(framework::GradVarName("X")); + PADDLE_ENFORCE_EQ(xs.size(), dxs.size()); + for (size_t i = 0; i < dxs.size(); ++i) { + if (dxs[i] != nullptr) { + dxs[i]->set_lod(xs[i]->lod()); + dxs[i]->mutable_data(context.GetPlace()); + } } - auto out_lod = ins[0]->lod(); - if (axis == 0UL) { - out_lod = ConcatLoD(ins, level); + std::vector sliced_x; + std::vector> sliced_dx; + + for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { + for (size_t j = 0; j < xs.size(); ++j) { + const framework::LoDTensor *x = xs[j]; + framework::LoDTensor *dx = dxs[j]; + auto &x_lod = x->lod()[0]; + sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i])); + if (dx != nullptr) { + sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i])); + } else { + sliced_dx.emplace_back(boost::blank()); + } + } } - const size_t level_idx = out_lod.size() - level - 1; - auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx]; - for (size_t i = 0; i < out_lod_level.size() - 1; ++i) { - Tensor out_grad_t = - out_grad->Slice(static_cast(out_lod_level[i]), - static_cast(out_lod_level[i + 1])); - auto out_grad_stride = framework::stride(out_grad_t.dims()); - size_t offset = 0; + math::ConcatGradFunctor functor; + std::vector sliced_x_ptr; + std::vector sliced_dx_ptr; + for (auto &x : sliced_x) { + sliced_x_ptr.emplace_back(&x); + } - for (size_t j = 0; j < n; ++j) { - auto x_grad_lod_level = - framework::ToAbsOffset(x_grads[j]->lod())[level_idx]; - auto x_grad_stride = framework::stride(x_grads[j]->dims()); - Tensor x_grad_t = - x_grads[j]->Slice(static_cast(x_grad_lod_level[i]), - static_cast(x_grad_lod_level[i + 1])); - size_t axis_dim = x_grad_t.dims()[axis]; - StridedMemcpy(ctx.device_context(), out_grad_t.data() + offset, - out_grad_stride, out_grad_t.dims(), x_grad_stride, - x_grad_t.data()); - offset += axis_dim * out_grad_stride[axis]; + for (auto &dx : sliced_dx) { + try { + sliced_dx_ptr.emplace_back(&boost::get(dx)); + } catch (boost::bad_get &) { + sliced_dx_ptr.emplace_back(nullptr); } } + functor(context.template device_context(), + detail::Ref( + context.Input(framework::GradVarName("Out")), + "Sequence Concat OG must be set"), + sliced_x_ptr, 0, &sliced_dx_ptr); } }; diff --git a/paddle/fluid/operators/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_expand_as_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..33c1e1c973c80ba3943924331380d35b225ac800 --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_as_op.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_expand_as_op.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; + +class SequenceExpandAsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceExpandAsOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of SequenceExpandAsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceExpandAsOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = x_dims; + + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Dimension number of Input(X) should be at least 2."); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + framework::Variable* y_var = + boost::get(ctx->GetInputVarPtrs("Y")[0]); + + auto& x_dim = x_var->Get().dims(); + auto& y_lod = y_var->Get().lod(); + + PADDLE_ENFORCE_EQ(y_lod.size(), 1, + "Level number of Input(Y)'s lod should be 1."); + + PADDLE_ENFORCE_EQ(static_cast(x_dim[0]), y_lod[0].size() - 1, + "The first dimension of Input(X) should be equal " + "to the size of Input(Y)'s 0 level lod."); + + int64_t out_first_dim = 0; + if (y_lod[0].size() <= 1) { + out_first_dim = x_dims[0]; + } else { + for (size_t i = 1; i < y_lod[0].size(); ++i) { + out_first_dim += (y_lod[0][i] - y_lod[0][i - 1]); + } + } + out_dims[0] = out_first_dim; + } else { + out_dims[0] = -1; + } + + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("Y", /*->*/ "Out"); + } +}; + +class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) A 2-D LoDTensor whose lod " + "level is at most 1."); + AddInput("Y", + "(LoDTensor, default LoDTensor) Referred LoDTensor whose " + "lod (specified level) is referred by Input(X)."); + AddOutput("Out", + "(LodTensor, default LoDTensor) Output LoDTensor which is " + "generated from Input(X) by referring lod of Input(Y)."); + AddComment(R"DOC( +Sequence Expand As Operator. + +This operator expands `X` according to the zeroth level lod of `Y`. Current +implementation requires the level number of Input(Y)'s lod should be 1, and +the first dimension of Input(X) should be equal to the size of Input(Y)'s zeroth +level lod, and lod of Input(X) is not considered. + +Following are cases to better explain how this works: + +Case 1: + +Given a 1-level LoDTensor input(X) + X.data = [[a], [b], [c], [d]] + X.dims = [4, 1] +and input(Y) + Y.lod = [[0, 3, 6, 7, 8]] +ref_level: 0 +then we get 1-level LoDTensor + Out.lod = [[0, 3, 6, 7, 8]] + Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]] + Out.dims = [8, 1] + +Case 2: + +Given a common Tensor input(X) + X.data = [[a, b], [c, d], [e, f]] + X.dims = [3, 2] +and input(Y) + Y.lod = [[0, 2, 3, 6]] +ref_level: 0 +then we get a common LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]] + Out.dims = [6, 2] + +)DOC"); + } +}; + +class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", x_grad_name); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp, + ops::SequenceExpandAsOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad); +REGISTER_OP_CPU_KERNEL( + sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); +REGISTER_OP_CPU_KERNEL( + sequence_expand_as_grad, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_expand_as_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7357f5ae6e732f28307af65d1f1b6b3cbed1f640 --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_as_op.cu @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/sequence_expand_as_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +static __global__ void sequence_expand_as_kernel(const T *in_data, + const size_t *expand_offset, + const size_t src_hight, + const size_t src_widht, + T *out_data) { + for (int h_id = blockIdx.x; h_id < src_hight; h_id += gridDim.x) { + int span = expand_offset[h_id + 1] - expand_offset[h_id]; + if (span == 0) continue; + const T *src = in_data + h_id * src_widht; + for (int w_id = threadIdx.x; w_id < src_widht; w_id += blockDim.x) { + T ele = src[w_id]; + int offset = expand_offset[h_id] * src_widht; + for (int k = 0; k < span; ++k) { + out_data[offset + k * src_widht + w_id] = ele; + } + } + } +} + +template +static __global__ void sequence_expand_as_grad_kernel( + const T *dout_data, const size_t *expand_offset, const size_t dst_hight, + const size_t dst_width, T *dx_data) { + for (int h_id = blockIdx.x; h_id < dst_hight; h_id += gridDim.x) { + T *dst = dx_data + h_id * dst_width; + int span = expand_offset[h_id + 1] - expand_offset[h_id]; + + for (int w_id = threadIdx.x; w_id < dst_width; w_id += blockDim.x) { + T result = 0; + for (int k = 0; k < span; ++k) { + int offset = (expand_offset[h_id] + k) * dst_width; + const T *src = dout_data + offset; + result += src[w_id]; + } + dst[w_id] = result; + } + } +} + +template +struct SequenceExpandFunctor { + void operator()( + const platform::CUDADeviceContext &context, const LoDTensor &x, + const framework::Vector &ref_lod, /*expand referenced lod*/ + LoDTensor *out) { + int hight = x.dims()[0]; + int width = framework::product(x.dims()) / hight; + + const int kThreadsPerBlock = 1024; + int thread_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) { // block_cols is aligned by 32. + thread_x = ((width + 31) >> 5) << 5; + } + + int max_threads = context.GetMaxPhysicalThreadCount(); + int block_x = std::max(max_threads / thread_x, 1); + + dim3 block_size(thread_x); + dim3 grid_size(block_x); + sequence_expand_as_kernel<<>>( + x.data(), ref_lod.CUDAData(context.GetPlace()), hight, width, + out->mutable_data(context.GetPlace())); + } +}; + +template +struct SequenceExpandAsGradFunctor { + void operator()(const platform::CUDADeviceContext &context, + const LoDTensor &dout, + const framework::Vector &ref_lod, /*expand based lod*/ + LoDTensor *dx) { + int hight = dx->dims()[0]; + int width = framework::product(dx->dims()) / hight; + + const int kThreadsPerBlock = 1024; + int thread_x = kThreadsPerBlock; + if (width < kThreadsPerBlock) { // block_cols is aligned by 32. + thread_x = ((width + 31) >> 5) << 5; + } + + int max_threads = context.GetMaxPhysicalThreadCount(); + int block_x = std::max(max_threads / thread_x, 1); + + dim3 block_size(thread_x); + dim3 grid_size(block_x); + sequence_expand_as_grad_kernel<<>>( + dout.data(), ref_lod.CUDAData(context.GetPlace()), hight, width, + dx->mutable_data(context.GetPlace())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_expand_as_grad, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_expand_as_op.h new file mode 100644 index 0000000000000000000000000000000000000000..42c90d01c05e369efc276498aa94debb367a6bfa --- /dev/null +++ b/paddle/fluid/operators/sequence_expand_as_op.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // std::iota +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +struct SequenceExpandFunctor { + void operator()( + const DeviceContext &ctx, const framework::LoDTensor &x, + const framework::Vector &ref_lod, /*expand referenced lod*/ + framework::LoDTensor *out); +}; + +template +struct SequenceExpandAsGradFunctor { + void operator()( + const DeviceContext &ctx, const framework::LoDTensor &dout, + const framework::Vector &ref_lod, /*expand referenced lod*/ + framework::LoDTensor *dx); +}; + +template +struct SequenceExpandFunctor { + void operator()( + const platform::CPUDeviceContext &context, const framework::LoDTensor &x, + const framework::Vector &ref_lod, /*expand referenced lod*/ + framework::LoDTensor *out) { + int64_t hight = x.dims()[0]; + int64_t width = framework::product(x.dims()) / hight; + + const T *in_data = x.data(); + T *out_data = out->mutable_data(context.GetPlace()); + + for (int h_id = 0; h_id < hight; ++h_id) { + size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; + if (span == 0) continue; + const T *src = in_data + h_id * width; + for (int64_t w_id = 0; w_id < width; ++w_id) { + T ele = src[w_id]; + size_t offset = ref_lod[h_id] * width; + for (size_t k = 0; k < span; ++k) { + out_data[offset + k * width + w_id] = ele; + } + } + } + } +}; + +template +class SequenceExpandAsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *y = context.Input("Y"); + auto *out = context.Output("Out"); + + auto &y_lod = y->lod(); + PADDLE_ENFORCE_EQ(y_lod.size(), 1, "LoD of Y should be 1."); + PADDLE_ENFORCE_GT(y_lod[0].size(), 1, "."); + + out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.template device_context(); + SequenceExpandFunctor seq_espand_functor; + seq_espand_functor(dev_ctx, *x, y_lod[0], out); + } +}; + +/* + *Given Grad(Out) + * + * Grad(Out).lod = [[0, 3, 6]] + * Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + * Then + * Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)] + * = [0.6, 1.5] + * Grad(X).lod = Input(X).lod + * + * */ +template +struct SequenceExpandAsGradFunctor { + void operator()( + const platform::CPUDeviceContext &context, + const framework::LoDTensor &dout, + const framework::Vector &ref_lod, /*expand referenced lod*/ + framework::LoDTensor *dx) { + int64_t hight = dx->dims()[0]; + int64_t width = framework::product(dx->dims()) / hight; + + const T *dout_data = dout.data(); + T *dx_data = dx->mutable_data(context.GetPlace()); + + for (int64_t h_id = 0; h_id < hight; ++h_id) { + T *dst = dx_data + h_id * width; + size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; + for (int64_t w_id = 0; w_id < width; ++w_id) { + T result = 0; + for (size_t k = 0; k < span; ++k) { + size_t offset = (ref_lod[h_id] + k) * width; + result += dout_data[offset + w_id]; + } + dst[w_id] = result; + } + } + } +}; + +template +class SequenceExpandAsGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *g_out = + context.Input(framework::GradVarName("Out")); + auto *y = context.Input("Y"); + auto *g_x = + context.Output(framework::GradVarName("X")); + + g_x->mutable_data(context.GetPlace()); + + SequenceExpandAsGradFunctor functor; + functor(context.template device_context(), *g_out, + y->lod()[0], g_x); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_mask_op.cc b/paddle/fluid/operators/sequence_mask_op.cc index e45c18d6aff65ecac565ef05e36b2d47ad8744b8..798211f481659eb71248f7a6210e6522273d387f 100644 --- a/paddle/fluid/operators/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_mask_op.cc @@ -23,4 +23,8 @@ REGISTER_OP_CPU_KERNEL( paddle::operators::SequenceMaskKernel, paddle::operators::SequenceMaskKernel); + int64_t>, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_mask_op.cu b/paddle/fluid/operators/sequence_mask_op.cu index ff5acf4d9edd5f0f15cbcb22eae212c2d49ccaab..2ad23774579533b62b9189c1564ad7c7db5c298a 100644 --- a/paddle/fluid/operators/sequence_mask_op.cu +++ b/paddle/fluid/operators/sequence_mask_op.cu @@ -19,4 +19,8 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::SequenceMaskKernel, paddle::operators::SequenceMaskKernel); + int64_t>, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.cc b/paddle/fluid/operators/sequence_pad_op.cc index 44d73aa4076abfe15c906478702ac7c4a55303d4..4583b26256ba2e084bf7477c54d468df860d9b43 100644 --- a/paddle/fluid/operators/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_pad_op.cc @@ -29,10 +29,12 @@ class SequencePadOp : public framework::OperatorWithKernel { "Input(PadValue) of SequencePadOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Length"), + "Output(Length) of SequencePadOp should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(x_dims.size(), 2, - "The rank of Input(x) can't be less than 2."); + "The rank of Input(X) can't be less than 2."); auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size()); auto pad_value_dims = ctx->GetInputDim("PadValue"); PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) || @@ -41,8 +43,8 @@ class SequencePadOp : public framework::OperatorWithKernel { "shape equals to time steps in sequences"); int out_dim_0 = -1; - int out_dim_1 = -1; + int padded_length = ctx->Attrs().Get("padded_length"); if (ctx->IsRuntime()) { // run time framework::Variable* x_var = @@ -58,7 +60,6 @@ class SequencePadOp : public framework::OperatorWithKernel { int seq_num = x_lod_0.size() - 1; int max_seq_len = math::MaximumSequenceLength(x_lod_0); - int padded_length = ctx->Attrs().Get("padded_length"); if (padded_length == -1) { padded_length = max_seq_len; } @@ -66,19 +67,30 @@ class SequencePadOp : public framework::OperatorWithKernel { "The Attr(padded_length) must be -1 or an int greater " "than the length of the longest original sequence."); out_dim_0 = seq_num; - out_dim_1 = padded_length; } else { // compile time + if (padded_length == -1) { + padded_length = 1; + } framework::VarDesc* x_desc = boost::get(ctx->GetInputVarPtrs("X")[0]); PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); } - std::vector out_dims_vec{out_dim_0, out_dim_1}; + std::vector out_dims_vec{out_dim_0, padded_length}; + std::vector len_dims_vec{out_dim_0, 1}; auto time_step_dims_vec = framework::vectorize2int(time_step_dims); out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end()); ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Length", framework::make_ddim(len_dims_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); } }; @@ -96,6 +108,10 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Out", "(LoDTensor) The output vairable, which contains padded sequences."); + AddOutput( + "Length", + "(LoDTensor) The output vairable, which contains the actual length of " + "sequences before padding."); AddAttr( "padded_length", "The length of padded sequences. It can be setted to -1 or " @@ -125,6 +141,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[a, b, 0, 0], [c, d, e, 0]] + Length.data = [[2], [3]] Case 2: @@ -138,7 +155,8 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[[a1, a2], [b1, b2], [0, 0]], [[c1, c2], [d1, d2], [e1, e2]]] - + Length.data = [[2], [3]] + Case 3: Given a 1-level LoDTensor input(X): @@ -151,6 +169,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[[a1, a2], [b1, b2], [p1, p2]], [[c1, c2], [d1, d2], [e1, e2]]] + Length.data = [[2], [3]] )DOC"); } @@ -171,6 +190,13 @@ class SequencePadGradOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/sequence_pad_op.h b/paddle/fluid/operators/sequence_pad_op.h index 5fc9da69d787ff3aeffa716689d44772ad8f7bd2..840bd39a7f3eaca6cb03bca59016fc032e9a3068 100644 --- a/paddle/fluid/operators/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_pad_op.h @@ -32,6 +32,7 @@ class SequencePadOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); + auto* len_t = ctx.Output("Length"); out->mutable_data(ctx.GetPlace()); const auto* pad_value = ctx.Input("PadValue"); @@ -41,6 +42,15 @@ class SequencePadOpKernel : public framework::OpKernel { math::PaddingLoDTensorFunctor()( ctx.template device_context(), *x, out, *pad_value, padded_length, 0, false, math::kBatchLengthWidth); + + LoDTensor seq_len; + seq_len.Resize(len_t->dims()); + int64_t* len_data = seq_len.mutable_data(platform::CPUPlace()); + for (size_t i = 1; i < x->lod()[0].size(); ++i) { + len_data[i - 1] = x->lod()[0][i] - x->lod()[0][i - 1]; + } + framework::TensorCopy(seq_len, ctx.GetPlace(), + ctx.template device_context(), len_t); } }; diff --git a/paddle/fluid/operators/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_scatter_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..adb81bffccb50069b3a2e5f391f3fdfde231b2be --- /dev/null +++ b/paddle/fluid/operators/sequence_scatter_op.cc @@ -0,0 +1,156 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/sequence_scatter_op.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class SequenceScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The source input of sequence scatter op"); + AddInput("Ids", + "(LoDTensor) The index input of sequence scatter op where X" + " will be updated, must be a LoDTensor"); + AddInput("Updates", + "(LoDTensor) The values to scatter to the input tensor " + "X, must be a LoDTensor with the same LoD information as Ids"); + AddOutput("Out", + "(Tensor) The output tensor of sequence scatter op, which " + "has the same dims as X"); + AddComment(R"DOC( +Sequence Scatter Operator. + +This operator scatters the Updates tensor to the input X. It uses the LoD +information of Ids to select the rows to update, and use the values in Ids as +the columns to update in each row of X. + +Following are cases to better explain how this works: + +Example 1: +Given an all-ones Tensor input(X) + X.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] + X.dims = [3, 6] +a LoDTensor input(Ids) + Ids.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]] + Ids.lod = [[0, 3, 8, 12]] +and a Tensor input(Updates) + Updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]] + Updates.lod = [[ 0, 3, 8, 12]] +then we get an output Tensor + Out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], + [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] + Out.dims = X.dims = [3, 6] +)DOC"); + } +}; + +class SequenceScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // Enforce has inputs and outputs + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of SequenceScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Updates"), + "Input(Updates) of SequenceScatterOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceScatterOp should not be null."); + + // Set output dim the same as input + auto ref_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", ref_dims); + + // Enforce the Updates and Ids are the same shape + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], + ctx->GetInputDim("Ids")[0], + "Updates and Ids should have same shape."); + + // Enforce LoD of ids and updates be the same + if (ctx->IsRuntime()) { + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + framework::Variable* updates_var = + boost::get(ctx->GetInputVarPtrs("Updates")[0]); + + auto& ids_lod = ids_var->Get().lod(); + auto& updates_lod = updates_var->Get().lod(); + PADDLE_ENFORCE_EQ(ids_lod.size(), 1, + "Currently only level 1 LoD could be" + " processed by sequence scatter op."); + PADDLE_ENFORCE_EQ(updates_lod.size(), 1, + "Currently only level 1 LoD " + "could be processed by sequence scatter op."); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class SequenceScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp, + ops::SequenceScatterOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp); +REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel, + ops::SequenceScatterOpKernel, + ops::SequenceScatterOpKernel, + ops::SequenceScatterOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_scatter_grad, + ops::SequenceScatterGradientOpKernel, + ops::SequenceScatterGradientOpKernel, + ops::SequenceScatterGradientOpKernel, + ops::SequenceScatterGradientOpKernel); diff --git a/paddle/fluid/operators/sequence_scatter_op.h b/paddle/fluid/operators/sequence_scatter_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d9b681b7aa76849a40d50e3348418d7604641c10 --- /dev/null +++ b/paddle/fluid/operators/sequence_scatter_op.h @@ -0,0 +1,122 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* ids = ctx.Input("Ids"); + auto* updates = ctx.Input("Updates"); + auto* out = ctx.Output("Out"); + + auto& ids_lod = ids->lod(); + + // Initialize out as same as x + out->mutable_data(ctx.GetPlace()); + framework::TensorCopySync(*x, ctx.GetPlace(), out); + + auto x_dims = x->dims(); + auto out_dims = out->dims(); + + for (int i = 0; i < x_dims.size(); ++i) + PADDLE_ENFORCE(x_dims[i] == out_dims[i], + "Input and output shape of " + "sequence scatter op must exactly be the same."); + + size_t slice_size = 1; + for (int i = 1; i < x_dims.size(); ++i) slice_size *= x_dims[i]; + + auto lod_vec = ids_lod[0]; + unsigned int seg = 0; + for (int i = 0; i < ids->dims()[0]; ++i) { + PADDLE_ENFORCE_LT(seg, lod_vec.size() - 1, + "Segment num must not exceed batch size.\n"); + int lower_bound = lod_vec[seg]; + int upper_bound = lod_vec[seg + 1]; + if (i >= lower_bound && i < upper_bound) { + T* p_out = out->data(); + const T* p_updates = updates->data(); + const int64_t* p_index = ids->data(); + p_out[seg * slice_size + p_index[i]] += p_updates[i]; + } else { + ++seg; + --i; + } + } + } +}; + +template +class SequenceScatterGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto* ids = ctx.Input("Ids"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + + auto& ids_lod = ids->lod(); + + dX->mutable_data(ctx.GetPlace()); + framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); + dUpdates->mutable_data(ctx.GetPlace()); + + auto dx_dims = dX->dims(); + auto dout_dims = dOut->dims(); + + for (int i = 0; i < dx_dims.size(); ++i) + PADDLE_ENFORCE(dx_dims[i] == dout_dims[i], + "Input and output shape of " + "sequence scatter grad op must exactly be the same."); + + size_t slice_size = 1; + for (int i = 1; i < dx_dims.size(); ++i) slice_size *= dx_dims[i]; + + auto lod_vec = ids_lod[0]; + unsigned int seg = 0; + + for (int i = 0; i < ids->dims()[0]; ++i) { + PADDLE_ENFORCE_LT(seg, lod_vec.size() - 1, + "Segment num must not exceed batch size.\n"); + int lower_bound = lod_vec[seg]; + int upper_bound = lod_vec[seg + 1]; + if (i >= lower_bound && i < upper_bound) { + const T* p_dOut = dOut->data(); + const int64_t* p_index = ids->data(); + T* p_dUpdates = dUpdates->data(); + p_dUpdates[i] = p_dOut[seg * slice_size + p_index[i]]; + } else { + ++seg; + --i; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc index 7aca9f7111956dba63e2ceee10077879fe092bdf..585363958696fa0d8ed1ffdc7b6fdaab26349b08 100644 --- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc @@ -29,8 +29,8 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); - auto lod = x->lod(); - auto dims = x->dims(); + auto& lod = x->lod(); + auto& dims = x->dims(); const size_t level = lod.size() - 1; PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), @@ -71,7 +71,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { if (x_grad) { x_grad->set_lod(x->lod()); } - auto lod = x->lod(); + auto& lod = x->lod(); const size_t level = lod.size() - 1; x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_softmax_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e94ceaa170131e8bce7d1574b27f0baeaa8d1ffc --- /dev/null +++ b/paddle/fluid/operators/sequence_softmax_op.cu @@ -0,0 +1,171 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include // NOLINT +#include "paddle/fluid/operators/sequence_softmax_op.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +__device__ __forceinline__ float real_exp(float x) { return expf(x); } +__device__ __forceinline__ double real_exp(double x) { return exp(x); } + +template +using BlockReduce = cub::BlockReduce; + +template +using BlockReduceTempStorage = typename BlockReduce::TempStorage; + +template +__global__ void sequence_softmax_kernel(const T *in_data, const size_t *ref_lod, + const size_t src_hight, T *out_data) { + __shared__ BlockReduceTempStorage temp_storage; + __shared__ T shared_max_data; + __shared__ T shared_sum_data; + + for (int i = blockIdx.x; i < src_hight; i += gridDim.x) { + size_t start = ref_lod[i]; + size_t span = ref_lod[i + 1] - start; + + // Find the max ele + T max_ele = -FLT_MAX; + for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { + T ele = in_data[start + tid]; + max_ele = max_ele > ele ? max_ele : ele; + } + max_ele = + BlockReduce(temp_storage).Reduce(max_ele, cub::Max()); + if (threadIdx.x == 0) { + shared_max_data = max_ele; + } + __syncthreads(); + + // sum + T sum_data = 0; + for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { + T ele = in_data[start + tid]; + sum_data += real_exp(ele - shared_max_data); + } + sum_data = + BlockReduce(temp_storage).Reduce(sum_data, cub::Sum()); + if (threadIdx.x == 0) { + shared_sum_data = sum_data; + } + __syncthreads(); + + // get final resit + for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { + T ele = in_data[start + tid]; + ele = real_exp(ele - shared_max_data) / shared_sum_data; + out_data[start + tid] = ele; + } + } +} + +template +__global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, + const T *softmax_data, + const size_t *ref_lod, + const size_t src_hight, + T *dx_data) { + __shared__ BlockReduceTempStorage temp_storage; + __shared__ T shared_data; + + for (int i = blockIdx.x; i < src_hight; i += gridDim.x) { + size_t start = ref_lod[i]; + size_t span = ref_lod[i + 1] - start; + + T result = 0; + for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { + size_t idx = start + tid; + T s_g_d = softmax_grad_data[idx]; + T s_d = softmax_data[idx]; + result += s_g_d * s_d; + } + result = BlockReduce(temp_storage).Reduce(result, cub::Sum()); + if (threadIdx.x == 0) { + shared_data = result; + } + __syncthreads(); + + for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { + size_t idx = start + tid; + T s_g_d = softmax_grad_data[idx]; + T s_d = softmax_data[idx]; + dx_data[idx] = (s_g_d - shared_data) * s_d; + } + } +} + +template +struct SequenceSoftmaxFunctor { + void operator()(const platform::CUDADeviceContext &context, + const LoDTensor &x, + const framework::Vector &ref_lod, /*referenced lod*/ + LoDTensor *out) { + int hight = ref_lod.size() - 1; + + const int kThreadsPerBlock = 32; + int thread_x = kThreadsPerBlock; + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + dim3 block_size(thread_x); + dim3 grid_size(max_blocks); + sequence_softmax_kernel< + T, kThreadsPerBlock><<>>( + x.data(), ref_lod.CUDAData(context.GetPlace()), hight, + out->mutable_data(context.GetPlace())); + } +}; + +template +struct SequenceSoftmaxGradFunctor { + void operator()(const platform::CUDADeviceContext &context, + const LoDTensor &dout, const LoDTensor &out, + const framework::Vector &ref_lod, /*referenced lod*/ + LoDTensor *dx) { + size_t hight = ref_lod.size() - 1; + + const int kThreadsPerBlock = 32; + int thread_x = kThreadsPerBlock; + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + dim3 block_size(thread_x); + dim3 grid_size(max_blocks); + + sequence_softmax_grad_kernel< + T, kThreadsPerBlock><<>>( + dout.data(), out.data(), ref_lod.CUDAData(context.GetPlace()), + hight, dx->mutable_data(context.GetPlace())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc deleted file mode 100644 index 397df75415691e4f53bc399cd1868c3e37bc9110..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sequence_softmax_op.cu.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_softmax_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h index bca564e16f9951519eefe25126aadebb4c1326b6..ed49e9471458cbca2d4760d966ef30033f292778 100644 --- a/paddle/fluid/operators/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_softmax_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" namespace paddle { namespace operators { @@ -23,12 +22,76 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +template +struct SequenceSoftmaxFunctor { + void operator()( + const DeviceContext &ctx, const LoDTensor &x, + const framework::Vector &ref_lod, /*expand referenced lod*/ + LoDTensor *out); +}; + +template +struct SequenceSoftmaxGradFunctor { + void operator()(const DeviceContext &ctx, const LoDTensor &dout, + const LoDTensor &out, + const framework::Vector &ref_lod, /*referenced lod*/ + LoDTensor *dx); +}; + +template +struct SequenceSoftmaxFunctor { + void operator()(const platform::CPUDeviceContext &ctx, const LoDTensor &x, + const framework::Vector &ref_lod, /*referenced lod*/ + LoDTensor *out) { + size_t hight = ref_lod.size() - 1; + const T *in_data = x.data(); + T *out_data = out->mutable_data(ctx.GetPlace()); + for (size_t i = 0; i < hight; ++i) { + size_t span = ref_lod[i + 1] - ref_lod[i]; + T result = 0; + for (size_t j = 0; j < span; ++j) { + result += exp(in_data[ref_lod[i] + j]); + } + for (size_t j = 0; j < span; ++j) { + out_data[ref_lod[i] + j] = exp(in_data[ref_lod[i] + j]) / result; + } + } + } +}; + +template +struct SequenceSoftmaxGradFunctor { + void operator()(const platform::CPUDeviceContext &ctx, const LoDTensor &dout, + const LoDTensor &out, + const framework::Vector &ref_lod, /*referenced lod*/ + LoDTensor *dx) { + size_t hight = ref_lod.size() - 1; + + const T *softmax_grad_data = dout.data(); + const T *softmax = out.data(); + T *dx_data = dx->mutable_data(ctx.GetPlace()); + + for (size_t i = 0; i < hight; ++i) { + size_t span = ref_lod[i + 1] - ref_lod[i]; + T result = 0; + for (size_t j = 0; j < span; ++j) { + result += softmax_grad_data[ref_lod[i] + j] * softmax[ref_lod[i] + j]; + } + + for (size_t j = 0; j < span; ++j) { + dx_data[ref_lod[i] + j] = (softmax_grad_data[ref_lod[i] + j] - result) * + softmax[ref_lod[i] + j]; + } + } + } +}; + template class SequenceSoftmaxKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); auto lod = x->lod(); auto dims = x->dims(); @@ -42,55 +105,33 @@ class SequenceSoftmaxKernel : public framework::OpKernel { "SequenceSoftmaxOp should be 1."); out->mutable_data(ctx.GetPlace()); - for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - Tensor x_i = x->Slice(start_pos, end_pos); - Tensor out_i = out->Slice(start_pos, end_pos); - - // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) - framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); - x_i.Resize(dims_i); - out_i.Resize(dims_i); - math::SoftmaxFunctor()( - ctx.template device_context(), &x_i, &out_i); - } + + SequenceSoftmaxFunctor seq_softmax_functor; + seq_softmax_functor(ctx.template device_context(), *x, + lod[level], out); } }; template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - if (x_grad) { - x_grad->set_lod(x->lod()); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *x = ctx.Input("X"); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + if (!x_grad) { + return; } + x_grad->set_lod(x->lod()); auto lod = x->lod(); const size_t level = lod.size() - 1; - x_grad->mutable_data(ctx.GetPlace()); - for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - - Tensor out_i = out->Slice(start_pos, end_pos); - Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); - Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); - - // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) - framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); - out_i.Resize(dims_i); - out_grad_i.Resize(dims_i); - x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()( - ctx.template device_context(), &out_i, &out_grad_i, - &x_grad_i); - } + + SequenceSoftmaxGradFunctor seq_softmax_grad_functor; + seq_softmax_grad_functor(ctx.template device_context(), + *out_grad, *out, lod[level], x_grad); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 6dffe527c1072ee97fcde1725bfc1a47ed1ad74a..2c4c2411259e9b7bfa223e4d65823ce4610596d0 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -123,7 +123,6 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d854e2803975543b51c50ea2bc173322d3c3ca5e --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e +template +T Erfinv(T x) { + if (x < -1 || x > 1) { + return std::numeric_limits::quiet_NaN(); + } else if (x == 1.0) { + return std::numeric_limits::infinity(); + } else if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + + const T LN2 = 6.931471805599453094172321214581e-1; + + const T A0 = 1.1975323115670912564578e0; + const T A1 = 4.7072688112383978012285e1; + const T A2 = 6.9706266534389598238465e2; + const T A3 = 4.8548868893843886794648e3; + const T A4 = 1.6235862515167575384252e4; + const T A5 = 2.3782041382114385731252e4; + const T A6 = 1.1819493347062294404278e4; + const T A7 = 8.8709406962545514830200e2; + + const T B0 = 1.0000000000000000000e0; + const T B1 = 4.2313330701600911252e1; + const T B2 = 6.8718700749205790830e2; + const T B3 = 5.3941960214247511077e3; + const T B4 = 2.1213794301586595867e4; + const T B5 = 3.9307895800092710610e4; + const T B6 = 2.8729085735721942674e4; + const T B7 = 5.2264952788528545610e3; + + const T C0 = 1.42343711074968357734e0; + const T C1 = 4.63033784615654529590e0; + const T C2 = 5.76949722146069140550e0; + const T C3 = 3.64784832476320460504e0; + const T C4 = 1.27045825245236838258e0; + const T C5 = 2.41780725177450611770e-1; + const T C6 = 2.27238449892691845833e-2; + const T C7 = 7.74545014278341407640e-4; + + const T D0 = 1.4142135623730950488016887e0; + const T D1 = 2.9036514445419946173133295e0; + const T D2 = 2.3707661626024532365971225e0; + const T D3 = 9.7547832001787427186894837e-1; + const T D4 = 2.0945065210512749128288442e-1; + const T D5 = 2.1494160384252876777097297e-2; + const T D6 = 7.7441459065157709165577218e-4; + const T D7 = 1.4859850019840355905497876e-9; + + const T E0 = 6.65790464350110377720e0; + const T E1 = 5.46378491116411436990e0; + const T E2 = 1.78482653991729133580e0; + const T E3 = 2.96560571828504891230e-1; + const T E4 = 2.65321895265761230930e-2; + const T E5 = 1.24266094738807843860e-3; + const T E6 = 2.71155556874348757815e-5; + const T E7 = 2.01033439929228813265e-7; + + const T F0 = 1.414213562373095048801689e0; + const T F1 = 8.482908416595164588112026e-1; + const T F2 = 1.936480946950659106176712e-1; + const T F3 = 2.103693768272068968719679e-2; + const T F4 = 1.112800997078859844711555e-3; + const T F5 = 2.611088405080593625138020e-5; + const T F6 = 2.010321207683943062279931e-7; + const T F7 = 2.891024605872965461538222e-15; + + T abs_x = abs(x); + + if (abs_x <= 0.85) { + T r = 0.180625 - 0.25 * x * x; + T num = + (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) * + r + + A0); + T den = + (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) * + r + + B0); + return x * num / den; + } + + T r = sqrt(LN2 - log(1.0 - abs_x)); + + T num, den; + if (r <= 5.0) { + r = r - 1.6; + num = + (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) * + r + + C0); + den = + (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) * + r + + D0); + } else { + r = r - 5.0; + num = + (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) * + r + + E0); + den = + (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) * + r + + F0); + } + + if (x < 0) { + return -num / den; + } else { + return num / den; + } +} + +template +struct TruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + + T operator()(T value) const { + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return (std::sqrt(2.0) * Erfinv(2 * p - 1) + mean) * std; + } +}; + +template +class CPUTruncatedGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); + TruncatedNormal truncated_normal(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = truncated_normal(dist(engine)); + } + } +}; + +class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of TruncatedGaussianRandomOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + std::vector out_dim; + out_dim.reserve(shape.size()); + for (auto dim : shape) { + out_dim.push_back(static_cast(dim)); + } + PADDLE_ENFORCE(shape.size() > 0UL, + "shape can be one int or array. shape must be set."); + ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + return framework::OpKernelType( + static_cast(ctx.Attr("dtype")), + ctx.device_context(), layout, library); + } +}; + +class TruncatedGaussianRandomOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "Output tensor of truncated gaussian random op."); + + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); + AddAttr("mean", + "(float, default 0.0) " + "mean of random tensor.") + .SetDefault(.0f); + AddAttr("std", + "(float, default 1.0) " + "std of random tensor.") + .SetDefault(1.0f); + AddAttr("seed", + "(int, default 0) " + "Random seed of generator." + "0 means use system wide seed." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time.") + .SetDefault(0); + AddAttr("dtype", + "(int, default 5(FP32)) " + "Output data type.") + .SetDefault(framework::proto::VarType::FP32); + AddComment(R"DOC( +TruncatedGaussianRandom Operator. + +Used to initialize tensors with truncated gaussian random generator. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, + ops::TruncatedGaussianRandomOp, + ops::TruncatedGaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(truncated_gaussian_random, + ops::CPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ad2a9021bfe344d838dff2040b3fb9371274e218 --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct TruncatedNormal { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + + __host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed) + : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return (std::sqrt(2.0) * erfinvf(2 * p - 1) + mean) * std; + } +}; + +template +class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.Attr("mean")); + T std = static_cast(context.Attr("std")); + thrust::counting_iterator index_sequence_begin(0); + int64_t size = tensor->numel(); + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormal(mean, std, std::numeric_limits::min(), seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + truncated_gaussian_random, + paddle::operators::GPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 791138a8c0eb3c477942a8b723206a8f8a3eac77..16eac1ec2406c147fa765bc014038ae03a1416b2 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -1,16 +1,16 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include #include "paddle/fluid/framework/executor.h" @@ -138,6 +138,10 @@ class WhileGradOp : public framework::OperatorBase { auto inside_og_name = inside_og_names[i]; VLOG(8) << "Linking outside " << outside_og_name << " --> inside " << inside_og_name; + if (scope.FindVar(outside_og_name) == nullptr) { + continue; + } + auto &og_outside = detail::Ref(scope.FindVar(outside_og_name), "Cannot find Outside Gradient %s", outside_og_name); @@ -167,20 +171,46 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); } } + } else { + PADDLE_THROW("Currently only support LoDTensor and LoDTensorArray."); } } executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true, true); - auto &pg_names = Outputs(kXGRAD); + // The Outputs(kXGRAD) contains the names of the gradient of parameters + // and inputs. + auto &pg_ig_names = Outputs(kXGRAD); auto &p_names = Inputs(kX); - PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); - for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { - if (pg_names[param_id] == framework::kEmptyVarName) { + PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size()); + for (size_t param_id = 0; param_id < pg_ig_names.size(); ++param_id) { + if (pg_ig_names[param_id] == framework::kEmptyVarName) { continue; // parameter doesn't have gradient } auto inside_grad_name = framework::GradVarName(p_names[param_id]); + // for some grad_op, their input doesn't have gradient, + // for example lookup_table_grad_op, the input(Idx) doesn't have + // gradient. + auto pg_ig_var = cur_scope.FindVar(inside_grad_name); + PADDLE_ENFORCE(pg_ig_var != nullptr); + if (pg_ig_var->IsType()) { + auto pg_ig_lod_t_arr = + pg_ig_var->GetMutable(); + bool empty = true; + for (auto &each : *pg_ig_lod_t_arr) { + if (each.numel() != 0) { + empty = false; + break; + } + } + if (empty) { + LOG(WARNING) << pg_ig_names[param_id] + << " is not found in cur_scope."; + continue; + } + } + // // TODO(tonyyang-svail): Not sure we need the following // // If does not compute gradient of that variable inside rnn, // just @@ -194,6 +224,11 @@ class WhileGradOp : public framework::OperatorBase { if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); + PADDLE_ENFORCE(var->IsType() || + var->IsType(), + "Currently the type of var only can be LoDTensorArray " + "or LoDTensor."); + if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; @@ -201,7 +236,7 @@ class WhileGradOp : public framework::OperatorBase { attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; - auto var_name = pg_names[param_id]; + auto var_name = pg_ig_names[param_id]; auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", framework::VariableNameMap{}, {{"Out", {var_name}}}, attrs); @@ -213,8 +248,8 @@ class WhileGradOp : public framework::OperatorBase { } auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, + "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, + {{"Out", {pg_ig_names[param_id]}}}, framework::AttributeMap{{"use_mkldnn", {false}}}); sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); @@ -281,6 +316,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { parent_block->FindVarRecursive(input_name) != nullptr)) { continue; } + output_grads.insert(input_name); } for (auto &output_name : op->OutputArgumentNames()) { @@ -309,13 +345,13 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto p_names = op_desc.Input(kX); - auto pg_names = op_desc.Output(framework::GradVarName(kX)); + auto pg_ig_names = op_desc.Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); - auto *g_var = block->FindVarRecursive(pg_names[i]); + auto *g_var = block->FindVarRecursive(pg_ig_names[i]); if (g_var != nullptr) { // Gradient could be @EMPTY@ - VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i] + VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] << " type: " << p_var.GetType(); g_var->SetType(p_var.GetType()); g_var->SetDataType(p_var.GetDataType()); @@ -333,21 +369,21 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ctx->HasInputs(framework::GradVarName(kOutputs)); auto p_names = ctx->Inputs(kX); - auto pg_names = ctx->Outputs(kXGRAD); + auto pg_ig_names = ctx->Outputs(kXGRAD); auto var_types = ctx->GetInputsVarType(kX); std::vector names_to_set; std::vector dims_to_set; for (size_t i = 0; i < p_names.size(); ++i) { - if (pg_names[i] == framework::kEmptyVarName) { + if (pg_ig_names[i] == framework::kEmptyVarName) { continue; } auto dims = ctx->GetInputsElementDim(kX, i); if (var_types[i] == framework::proto::VarType::LOD_TENSOR) { - names_to_set.push_back(pg_names[i]); + names_to_set.push_back(pg_ig_names[i]); dims_to_set.push_back(dims); } else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) { // not sure how to set the dim of LOD_TENSOR_ARRAY - names_to_set.push_back(pg_names[i]); + names_to_set.push_back(pg_ig_names[i]); dims_to_set.push_back(dims); } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8bc30fc123163983f4bddc19af489920db93e0c0..8b62502e3f920a3bf7d80f9e7edc2f3647a0e5b1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -396,11 +396,6 @@ All parameter, weight, gradient are variables in Paddle. Prune(*prog_with_targets.Proto(), &pruned_desc); return new ProgramDesc(pruned_desc); }); - m.def("inference_optimize", [](ProgramDesc &origin) { - proto::ProgramDesc pruned_desc; - InferenceOptimize(*(origin.Proto()), &pruned_desc); - return new ProgramDesc(pruned_desc); - }); m.def("empty_var_name", []() { return std::string(framework::kEmptyVarName); }); m.def("grad_var_suffix", @@ -675,7 +670,14 @@ All parameter, weight, gradient are variables in Paddle. .def_property( "enable_data_balance", [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }); + [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }) + .def_property("fuse_elewise_add_act_ops", + [](const BuildStrategy &self) { + return self.fuse_elewise_add_act_ops_; + }, + [](BuildStrategy &self, bool b) { + self.fuse_elewise_add_act_ops_ = b; + }); pe.def(py::init &, const std::unordered_set &, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 77b9b36e68c88eab35bcc1a88ce08a7b5940d55f..f50a68c54114d5cce15418ad22f38c83163ba866 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -108,7 +108,15 @@ function cmake_gen() { fi fi fi - + + if [ "$SYSTEM" == "Darwin" ]; then + WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON} + WITH_AVX=${WITH_AVX:-ON} + INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo} + else + INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} + fi + cat <>> p, g = backward(...) - >>> with program.optimized_guard([p,g]): + >>> with program._optimized_guard([p,g]): >>> p = p - 0.001 * g """ OpRole = core.op_proto_and_checker_maker.OpRole @@ -1554,7 +1554,7 @@ class Program(object): res_str = _debug_string_(proto, throw_on_error) return res_str - def get_desc(self): + def _get_desc(self): """ Get the C++ side of `ProgramDesc` object pointer. The C++ object is exposed by :code:`pybind`. @@ -1647,7 +1647,7 @@ class Program(object): The two code snippets above will generate same programs. """ if for_test: - p = self.inference_optimize(export_for_deployment=False) + p = self._inference_optimize(prune_read_op=False) else: p = Program() p.current_block_idx = self.current_block_idx @@ -1663,10 +1663,10 @@ class Program(object): p._sync_with_cpp() p._copy_param_info_from(self) - p.copy_data_info_from(self) + p._copy_data_info_from(self) return p - def prune(self, targets): + def _prune(self, targets): """ Prune operators and variables which are not needed to generate :code:`targets`. @@ -1717,7 +1717,7 @@ class Program(object): res._sync_with_cpp() return res - def inference_optimize(self, export_for_deployment=True): + def _inference_optimize(self, prune_read_op=True): """ This method will create a new program and do following adjustments on it: 1. Remove all reader variables and their creator ops if exist. @@ -1729,8 +1729,8 @@ class Program(object): information will be lost. Args: - export_for_deployment(bool): remove the read ops that are added by py_reader - for cpp inference library + prune_read_op(bool): remove the read ops that are added by py_reader + for cpp inference library Notes: This API is a very low level API. Use :code:`Program.clone(for_test=True)` instead. @@ -1738,15 +1738,13 @@ class Program(object): Returns: Program: The new program. """ - # this is an alternative implement before - # core.inference_optimize being fixed. res = Program() res.desc = core.ProgramDesc(self.desc) # remove all readers and the read_op if exist read_op_idx = 0 root_block = res.desc.block(0) - if export_for_deployment: + if prune_read_op: while True: if read_op_idx >= root_block.op_size() or root_block.op( read_op_idx).type() == 'read': @@ -1841,7 +1839,7 @@ class Program(object): """ return self.blocks[self.current_block_idx] - def create_block(self, parent_idx=None): + def _create_block(self, parent_idx=None): """ Create a new block with the :code:`parent_idx` and change the current block to new block. @@ -1860,7 +1858,7 @@ class Program(object): self.blocks.append(Block(self, self.current_block_idx)) return self.current_block() - def rollback(self): + def _rollback(self): """ Exit a code block, i.e., roll back to the parent block. Returns: @@ -1906,7 +1904,7 @@ class Program(object): "program, with represent the same topology") self.global_block()._copy_param_info_from(other.global_block()) - def copy_data_info_from(self, other): + def _copy_data_info_from(self, other): """ Copy the information of data variables from other program. diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index bd46ed8e50c9344d471578eb0f89b7e214d62722..7a7a0078a557c47492a4396897aafabe6c9c5dcb 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -20,10 +20,10 @@ import contextlib from .core import VarDesc __all__ = [ - 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA', - 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', - 'UniformInitializer', 'NormalInitializer', 'XavierInitializer', - 'BilinearInitializer', 'MSRAInitializer' + 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', + 'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', + 'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer', + 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer' ] _force_init_on_cpu_ = False @@ -33,6 +33,8 @@ def force_init_on_cpu(): """ The flag of whether force to init variables on CPU. + Returns:: + Examples: .. code-block:: python @@ -272,6 +274,60 @@ class NormalInitializer(Initializer): return op +class TruncatedNormalInitializer(Initializer): + """Implements the Random TruncatedNormal(Gaussian) distribution initializer + + Args: + loc (float): mean of the normal distribution + scale (float): standard deviation of the normal distribution + seed (int): random seed + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0)) + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super(NormalInitializer, self).__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def __call__(self, var, block): + """Add truncated normal distribution initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed + op = block._prepend_op( + type="truncated_gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed + }) + var.op = op + return op + + class XavierInitializer(Initializer): """ This class implements the Xavier weight initializer from the paper @@ -583,6 +639,7 @@ class BilinearInitializer(Initializer): Constant = ConstantInitializer Uniform = UniformInitializer Normal = NormalInitializer +TruncatedNormal = TruncatedNormalInitializer Xavier = XavierInitializer MSRA = MSRAInitializer Bilinear = BilinearInitializer diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 656fafa0cb54d70e0eba8ec2bef21488c50d8d94..78bb8a1a0a64631cbe2adc11b1494ceed6d14908 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -20,6 +20,7 @@ import time import shutil import six +from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable from . import core @@ -515,8 +516,8 @@ def get_inference_program(target_vars, main_program=None): vars.extend(var.metrics) else: vars.append(var) - pruned_program = main_program.prune(targets=vars) - inference_program = pruned_program.inference_optimize() + pruned_program = main_program._prune(targets=vars) + inference_program = pruned_program._inference_optimize() return inference_program @@ -587,8 +588,11 @@ def save_inference_model(dirname, params_filename(str|None): The name of file to save all related parameters. If it is setted None, parameters will be saved in separate files . - export_for_deployment(bool): remove the read ops that are added by py_reader - for cpp inference lib. Default True + export_for_deployment(bool): If True, programs are modified to only support + direct inference deployment. Otherwise, + more information will be stored for flexible + optimization and re-training. Currently, only + True is supported. Returns: None @@ -636,21 +640,28 @@ def save_inference_model(dirname, if not os.path.isdir(dirname): os.makedirs(dirname) - # Clear the is_target information and remove the existed feed and fetch op - global_block = copy_program.global_block() - for i, op in enumerate(global_block.ops): - op.desc.set_is_target(False) - if op.type == "feed" or op.type == "fetch": - global_block._remove_op(i) - copy_program.desc.flush() - - pruned_program = copy_program.prune(targets=target_vars) - inference_program = pruned_program.inference_optimize( - export_for_deployment=export_for_deployment) - fetch_var_names = [v.name for v in target_vars] - - prepend_feed_ops(inference_program, feeded_var_names) - append_fetch_ops(inference_program, fetch_var_names) + # When export_for_deployment is true, we modify the program online so that + # it can only be loaded for inference directly. If it's false, the whole + # original program and related meta are saved so that future usage can be + # more flexible. + if export_for_deployment: + global_block = copy_program.global_block() + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed" or op.type == "fetch": + global_block._remove_op(i) + copy_program.desc.flush() + + pruned_program = copy_program._prune(targets=target_vars) + saved_program = pruned_program._inference_optimize(prune_read_op=True) + fetch_var_names = [v.name for v in target_vars] + + prepend_feed_ops(saved_program, feeded_var_names) + append_fetch_ops(saved_program, fetch_var_names) + else: + # TODO(panyx0718): Save more information so that it can also be used + # for training and more flexible post-processing. + saved_program = copy_program if model_filename is not None: model_filename = os.path.basename(model_filename) @@ -662,9 +673,9 @@ def save_inference_model(dirname, params_filename = os.path.basename(params_filename) with open(model_filename, "wb") as f: - f.write(inference_program.desc.serialize_to_string()) + f.write(saved_program.desc.serialize_to_string()) - save_persistables(executor, dirname, inference_program, params_filename) + save_persistables(executor, dirname, saved_program, params_filename) # if there is lookup table, the trainer 0 will notify all pserver to save. if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index c9a2f8a0abf9c811074e3fbadec0c61cb6dbf681..0049773bbeb514d5dfef490e73b9988bd5371029 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -41,7 +41,6 @@ __all__ = [ 'DynamicRNN', 'StaticRNN', 'reorder_lod_tensor_by_rank', - 'ParallelDo', 'Print', 'is_empty', ] @@ -218,10 +217,10 @@ class BlockGuard(object): self.main_program = main_program def __enter__(self): - self.main_program.create_block() + self.main_program._create_block() def __exit__(self, exc_type, exc_val, exc_tb): - self.main_program.rollback() + self.main_program._rollback() if exc_type is not None: return False # re-raise exception return True @@ -259,7 +258,7 @@ class ParallelDo(object): # ParallelDo version & Single-thread version if thread_num > 1: places = fluid.layers.get_places(thread_num) - pd = fluid.layers.ParallelDo(places) + pd = fluid.layers.control_flow.ParallelDo(places) with pd.do(): images = pd.read_input(images) label = pd.read_input(label) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1c73c837e2aa422b67704e171f66f5cd48e171ce..8e86bec8609f17c973389047df66b4d725113e6e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -723,11 +723,10 @@ def ssd_loss(location, target_label.stop_gradient = True conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples + actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2]) + actual_shape.stop_gradient = True conf_loss = nn.reshape( - x=conf_loss, - shape=(num, num_prior), - actual_shape=ops.slice( - conf_shape, axes=[0], starts=[0], ends=[2])) + x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype @@ -796,11 +795,7 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. - loss = nn.reshape( - x=loss, - shape=(num, num_prior), - actual_shape=ops.slice( - conf_shape, axes=[0], starts=[0], ends=[2])) + loss = nn.reshape(x=loss, shape=(num, num_prior), actual_shape=actual_shape) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 0cf7aaef4ab75ca6976465d1b404004a9f2f64c5..d56fa76300e7054ef71a7729483a579fa35f1dac 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -29,9 +29,8 @@ from ..layer_helper import LayerHelper from ..unique_name import generate as unique_name __all__ = [ - 'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', - 'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor', - 'load' + 'data', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', + 'random_data_generator', 'py_reader', 'Preprocessor', 'load' ] @@ -1008,9 +1007,9 @@ class Preprocessor(object): @contextlib.contextmanager def block(self): self.status = Preprocessor.IN_SUB_BLOCK - self.sub_block = self.main_prog.create_block() + self.sub_block = self.main_prog._create_block() yield - self.main_prog.rollback() + self.main_prog._rollback() self.status = Preprocessor.AFTER_SUB_BLOCK if not self._is_completed(): raise RuntimeError( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3ae0fac4bef5c47964f9a9cd8dd45b57e705e1f8..f896cfa04b3e0d89daaa1bd7fd893b5892a09a4e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -54,6 +54,7 @@ __all__ = [ 'conv2d_transpose', 'conv3d_transpose', 'sequence_expand', + 'sequence_expand_as', 'sequence_pad', 'lstm_unit', 'reduce_sum', @@ -99,6 +100,7 @@ __all__ = [ 'resize_bilinear', 'gather', 'scatter', + 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', @@ -112,6 +114,8 @@ __all__ = [ 'pad2d', 'unstack', 'sequence_enumerate', + 'expand', + 'sequence_concat', ] @@ -1273,7 +1277,7 @@ def sequence_conv(input, return helper.append_activation(pre_act) -def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): +def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=False): """ This function computes the softmax activation among all time-steps for each sequence. The dimension of each time-step should be 1. Thus, the shape of @@ -1296,7 +1300,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): bias_attr (ParamAttr|None): attributes for bias param_attr (ParamAttr|None): attributes for parameter use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. Default: True + library is installed. Default: False Returns: Variable: output of sequence_softmax @@ -1780,6 +1784,31 @@ def sequence_pool(input, pool_type): return pool_out +@templatedoc() +def sequence_concat(input, name=None): + """ + ${comment} + + Args: + input(list): List of Variables to be concatenated. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: Output variable of the concatenation. + + Examples: + .. code-block:: python + + out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3]) + """ + helper = LayerHelper('sequence_concat', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]}) + return out + + def sequence_first_step(input): """ This function gets the first step of sequence. @@ -2315,16 +2344,20 @@ def conv2d_transpose(input, .. math:: - H_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ - W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 + H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ + W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ + H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ + W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) Args: input(Variable): The input image with [N, C, H, W] format. num_filters(int): The number of the filter. It is as same as the output image channel. output_size(int|tuple|None): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). This - parameter only works when filter_size is None. + tuple, it must contain two integers, (image_H, image_W). None if use + filter_size, padding, and stride to calculate output_size. + if output_size and filter_size are specified at the same time, They + should follow the formula above. filter_size(int|tuple|None): The filter size. If filter_size is a tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, the filter will be a square. None if use output size to @@ -2402,7 +2435,13 @@ def conv2d_transpose(input, else: filter_size = utils.convert_to_list(filter_size, 2, 'conv2d_transpose.filter_size') - + if output_size is None: + output_size = [] + elif isinstance(output_size, list) or isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + else: + raise ValueError("output_size should be list or int") + padding = utils.convert_to_list(padding, 2, 'padding') groups = 1 if groups is None else groups filter_shape = [input_channel, num_filters // groups] + filter_size img_filter = helper.create_parameter( @@ -2415,6 +2454,7 @@ def conv2d_transpose(input, 'Filter': [img_filter]}, outputs={'Output': pre_bias}, attrs={ + 'output_size': output_size, 'strides': stride, 'paddings': padding, 'dilations': dilation, @@ -2666,6 +2706,71 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp +def sequence_expand_as(x, y, name=None): + """Sequence Expand As Layer. This layer will expand the input variable **x** + according to the zeroth level lod of **y**. Current implementation requires + the level number of Input(Y)'s lod must be 1, and the first dimension of + Input(X) should be equal to the size of Input(Y)'s zeroth level lod, and + lod of Input(X) is not considered. + + Following examples will explain how sequence_expand_as works: + + .. code-block:: text + + * Case 1: + + Given a 1-level LoDTensor input(X) + X.data = [[a], [b], [c], [d]] + X.dims = [4, 1] + and input(Y) + Y.lod = [[0, 3, 6, 7, 8]] + ref_level: 0 + then we get 1-level LoDTensor + Out.lod = [[0, 3, 6, 7, 8]] + Out.data = [[a], [a], [a], [b], [b], [b], [c], [d]] + Out.dims = [8, 1] + + * Case 2: + + Given a common Tensor input(X) + X.data = [[a, b], [c, d], [e, f]] + X.dims = [3, 2] + and input(Y) + Y.lod = [[0, 2, 3, 6]] + ref_level: 0 + then we get a common LoDTensor + Out.lod = [[0, 2, 3, 6]] + Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]] + Out.dims = [6, 2] + + Args: + x (Variable): The input variable which is a Tensor or LoDTensor. + y (Variable): The input variable which is a LoDTensor. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The expanded variable which is a LoDTensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10], dtype='float32') + y = fluid.layers.data(name='y', shape=[10, 20], + dtype='float32', lod_level=1) + out = layers.sequence_expand_as(x=x, y=y) + """ + helper = LayerHelper('sequence_expand_as', input=x, **locals()) + dtype = helper.input_dtype() + tmp = helper.create_tmp_variable(dtype) + helper.append_op( + type='sequence_expand_as', + inputs={'X': x, + 'Y': y}, + outputs={'Out': tmp}) + return tmp + + @templatedoc() def sequence_pad(x, pad_value, maxlen=None): """ @@ -2684,7 +2789,8 @@ def sequence_pad(x, pad_value, maxlen=None): longest original sequence." Returns: - Variable: The padded sequence batch. All sequences has the same length. + Variable: The padded sequence batch and the original lengths before + padding. All sequences has the same length. Examples: .. code-block:: python @@ -2700,15 +2806,21 @@ def sequence_pad(x, pad_value, maxlen=None): helper = LayerHelper('sequence_pad', input=x, **locals()) dtype = helper.input_dtype() out = helper.create_tmp_variable(dtype) + length = helper.create_tmp_variable(dtype) + + pad_value.stop_gradient = True + length.stop_gradient = True + if maxlen is None: maxlen = -1 helper.append_op( type='sequence_pad', inputs={'X': x, 'PadValue': pad_value}, - outputs={'Out': out}, + outputs={'Out': out, + 'Length': length}, attrs={'padded_length': maxlen}) - return out + return out, length def beam_search(pre_ids, @@ -3388,7 +3500,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): return out -def matmul(x, y, transpose_x=False, transpose_y=False, name=None): +def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ Applies matrix multiplication to two tensors. @@ -3422,6 +3534,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): y (Variable): The input variable which is a Tensor or LoDTensor. transpose_x (bool): Whether to transpose :math:`x` before multiplication. transpose_y (bool): Whether to transpose :math:`y` before multiplication. + alpha (float): The scale of output. Default 1.0. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3489,8 +3602,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): inputs={'X': x, 'Y': y}, outputs={'Out': out}, - attrs={'transpose_X': transpose_x, - 'transpose_Y': transpose_y}) + attrs={ + 'transpose_X': transpose_x, + 'transpose_Y': transpose_y, + 'alpha': alpha, + }) return out @@ -5310,6 +5426,66 @@ def scatter(input, index, updates, name=None): return out +def sequence_scatter(input, index, updates, name=None): + """ + **Sequence Scatter Layer** + + This operator scatters the Updates tensor to the input X. It uses the LoD + information of Ids to select the rows to update, and use the values in Ids as + the columns to update in each row of X. + + Here is an example: + Given the following input: + .. code-block:: text + input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] + input.dims = [3, 6] + + index.data = [[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], [5], [4]] + index.lod = [[0, 3, 8, 12]] + + updates.data = [[0.3], [0.3], [0.4], [0.1], [0.2], [0.3], [0.4], [0.0], [0.2], [0.3], [0.1], [0.4]] + updates.lod = [[ 0, 3, 8, 12]] + + Then we have the output: + .. code-block:: text + out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], + [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] + out.dims = X.dims = [3, 6] + + Args: + input (Variable): The source input with rank>=1. + index (Variable): A LoD Tensor. The index input of sequence scatter op + where input will be updated. The index input with rank=1. Its dtype + should be int32 or int64 as it is used as indexes. + updates (Variable): A LoD Tensor. The values to scatter to the input + tensor X, must be a LoDTensor with the same LoD information as index. + name (str|None): The output variable name. Default None. + + Returns: + output (Variable): The output is a tensor with the same shape as input. + + Examples: + + .. code-block:: python + + output = fluid.layers.sequence_scatter(input, index, updates) + + """ + helper = LayerHelper('sequence_scatter', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="sequence_scatter", + inputs={"X": input, + "Ids": index, + "Updates": updates}, + outputs={"Out": out}) + return out + + @templatedoc() def random_crop(x, shape, seed=None): """ @@ -5925,7 +6101,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): inputs={'X': [x]}, outputs={'Y': out}, attrs={ - 'max_len': maxlen if maxlen is not None else -1, + 'maxlen': maxlen if maxlen is not None else -1, 'out_dtype': out.dtype }) return out @@ -6008,3 +6184,53 @@ def unstack(x, axis=0, num=None): attrs={'axis': axis, 'num': num}) return outs + + +def expand(x, expand_times, name=None): + """Expand operator tiles the input by given times number. You should set times + number for each dimension by providing attribute 'expand_times'. The rank of X + should be in [1, 6]. Please note that size of 'expand_times' must be the same + with X's rank. Following is a using case: + + + .. code-block:: text + + Input(X) is a 3-D tensor with shape [2, 3, 1]: + + [ + [[1], [2], [3]], + [[4], [5], [6]] + ] + + Attr(expand_times): [1, 2, 2] + + Output(Out) is a 3-D tensor with shape [2, 6, 2]: + + [ + [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], + [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] + ] + + Args: + x (Variable): A tensor with rank in [1, 6]. + expand_times (list|tuple): Expand times number for each dimension. + + Returns: + Variable: The expanded variable which is a LoDTensor. After expanding, size of each dimension of Output(Out) is equal to ithe size of the corresponding dimension of Input(X) multiplying the corresponding value given by expand_times. + + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10], dtype='float32') + out = fluid.layers.expand(x=x, expand_times=[1, 2, 2]) + """ + helper = LayerHelper('expand', input=x, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='expand', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'expand_times': expand_times}) + return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 215f0cf2fc5ab4fbd06719ac4790a01dd00080eb..ef7b16a19e10a28bd1cc34496fb908580c5d7330 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -236,7 +236,7 @@ class Optimizer(object): for param_and_grad in parameters_and_grads: if param_and_grad[1] is None: continue - with param_and_grad[0].block.program.optimized_guard( + with param_and_grad[0].block.program._optimized_guard( param_and_grad), name_scope("optimizer"): if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, @@ -580,7 +580,7 @@ class AdamOptimizer(Optimizer): for param, grad in param_and_grads: if grad is None: continue - with param.block.program.optimized_guard([param, grad]): + with param.block.program._optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, @@ -709,7 +709,7 @@ class AdamaxOptimizer(Optimizer): for param, grad in parameters_and_grads: if grad is None: continue - with param.block.program.optimized_guard([param, grad]): + with param.block.program._optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) main_block.append_op( @@ -1198,7 +1198,7 @@ class ModelAverage(Optimizer): for param, grad in self.params_grads: if grad is None: continue - with param.block.program.optimized_guard([param, grad]): + with param.block.program._optimized_guard([param, grad]): self._append_average_accumulate_op(param) self.apply_program = Program() diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index da38626111a6767e1a76a35d6d1375ccc1283de4..8f4678649f2146c84150ad2659e497bbf0365d03 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -47,7 +47,7 @@ def append_regularization_ops(parameters_and_grads, regularization=None): if grad is None: params_and_grads.append((param, grad)) continue - with param.block.program.optimized_guard([param, grad]): + with param.block.program._optimized_guard([param, grad]): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 82f1c6615f3c4ca54bf5e979b55082022cd4da9f..a666507bd9aaf715718d0c17e581079faaeba023 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -15,6 +15,7 @@ from __future__ import print_function from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import unittest import paddle.fluid as fluid import paddle @@ -147,7 +148,7 @@ def train(word_dict, data, label, input_dim=dict_dim, class_dim=class_dim) else: places = get_places() - pd = fluid.layers.ParallelDo(places) + pd = ParallelDo(places) with pd.do(): cost, acc, _ = net_method( pd.read_input(data), diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 9fe361425c128590da910128beaccb3336f8ba57..cba486cf5996a0b48da83bf83ab47dbf1153e2f4 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -223,7 +223,7 @@ def infer(use_cuda, save_dirname=None): # Use inference_transpiler to speedup inference_transpiler_program = inference_program.clone() - t = fluid.InferenceTranspiler() + t = fluid.transpiler.InferenceTranspiler() t.transpile(inference_transpiler_program, place) # Construct feed as a dictionary of {feed_target_name: feed_target_data} diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index da216d0cc4a2867cb169240d28235b6db747a818..135f11d24c8fde35995cfe577859874ca8428cd4 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -25,6 +25,7 @@ import numpy import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo BATCH_SIZE = 64 @@ -81,7 +82,7 @@ def train(nn_type, if parallel: places = get_places() - pd = fluid.layers.ParallelDo(places) + pd = ParallelDo(places) with pd.do(): img_ = pd.read_input(img) label_ = pd.read_input(label) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index fe063eb4629dbe06dc65ce98c6c01858db901f03..9191f0fc2037d32159c66ca64911864efb34ae30 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,7 +85,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): [first_word, second_word, third_word, forth_word, next_word]) else: places = get_places() - pd = fluid.layers.ParallelDo(places) + pd = ParallelDo(places) with pd.do(): avg_cost, predict_word = __network__( list( diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index f530f8f4882a23df18c141b51560cb618fce86b5..dab2a52bc9062d66a2b03c933fc00023915b260e 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization @@ -38,7 +39,7 @@ if fluid.core.is_compiled_with_cuda(): place = fluid.CUDAPlace(0) places = get_places(device_count=0, device_type=device_type) -pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) +pd = ParallelDo(places, use_nccl=use_nccl) with pd.do(): x_ = pd.read_input(x) y_ = pd.read_input(y) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 958e72ce27f38e48da17ca738c24e665645ae033..88d36fe639c7437b478efb2ee292d792677403d9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -30,15 +30,15 @@ list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test if(APPLE) - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext") + if(NOT WITH_DISTRIBUTE) + list(REMOVE_ITEM TEST_OPS test_desc_clone) + list(REMOVE_ITEM TEST_OPS test_program_code) + endif(NOT WITH_DISTRIBUTE) + message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_dist_se_resnext") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) - list(REMOVE_ITEM TEST_OPS test_desc_clone) - list(REMOVE_ITEM TEST_OPS test_debugger) - list(REMOVE_ITEM TEST_OPS test_program_code) - list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) endif() @@ -73,11 +73,13 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$ if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) - set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) - set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + if(NOT APPLE) + set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) + set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) + endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) - py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 56a242b996f67aa4b9c858ab8aaeb1c1cd3bcf60..e97643cddef22465436051a41ef4b825e9634d23 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -47,8 +47,7 @@ def get_numeric_gradient(place, input_to_check, output_names, delta=0.005, - in_place=False, - sum_outputs=None): + in_place=False): # FIXME: change this method by compile time concepts set_input(scope, op, inputs, place) @@ -59,8 +58,6 @@ def get_numeric_gradient(place, sum = [] op.run(scope, place) for output_name in output_names: - if sum_outputs and output_name not in sum_outputs: - continue sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).sum() / len(output_names) @@ -348,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t)) + str(actual_t) + " in class " + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + @@ -407,14 +404,13 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, - sum_outputs=None): + user_defined_grads=None): places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads, sum_outputs) + user_defined_grads) def check_grad_with_place(self, place, @@ -424,8 +420,7 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, - sum_outputs=None): + user_defined_grads=None): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() @@ -448,8 +443,7 @@ class OpTest(unittest.TestCase): input_to_check, output_names, delta=numeric_grad_delta, - in_place=in_place, - sum_outputs=sum_outputs) for input_to_check in inputs_to_check + in_place=in_place) for input_to_check in inputs_to_check ] analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 74e9d5c5f91e53a315c85d428571ce45bacede8a..ee291fe746f3a1b6ce18df9fb6aa174a89e2eadd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -38,6 +38,7 @@ class TestParallelExecutorBase(unittest.TestCase): seed=None, use_parallel_executor=True, use_reduce=False, + fuse_elewise_add_act_ops=False, optimizer=fluid.optimizer.Adam, use_fast_executor=False): def run_executor(exe, feed, fetch_list, program=None): @@ -78,6 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops if use_parallel_executor: exe = fluid.ParallelExecutor( diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 2a320e735bd7db5dc138f8263ba1b5cb115ba197..5bb769b16891d3b7163874751f9bcd25593b4b44 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -35,6 +35,10 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): d_bolck_w = dilations[1] * (f_w - 1) + 1 out_h = (in_h - 1) * stride[0] + d_bolck_h out_w = (in_w - 1) * stride[1] + d_bolck_w + if 'output_size' in attrs: + output_size = attrs['output_size'] + out_h = output_size[0] + 2 * pad[0] + out_w = output_size[1] + 2 * pad[1] out = np.zeros((in_n, out_c, out_h, out_w)) @@ -65,6 +69,7 @@ class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose self.use_cudnn = False + self.output_size = None self.init_op_type() self.init_test_case() @@ -80,6 +85,8 @@ class TestConv2dTransposeOp(OpTest): 'use_cudnn': self.use_cudnn, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size output = conv2dtranspose_forward_naive(input_, filter_, self.attrs).astype('float32') @@ -192,6 +199,18 @@ class TestWithDilation(TestConv2dTransposeOp): self.filter_size = [f_c, 6, 3, 3] +class TestWithEvenUpsample(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + # ------------ test_cudnn ------------ @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") @@ -265,6 +284,15 @@ class TestDepthwiseConvTranspose(TestConv2dTransposeOp): self.op_type = "depthwise_conv2d_transpose" +# ------------ test_cudnn ------------ +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNWithEvenUpsample(TestWithEvenUpsample): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv2d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f6eb8f2c6d8b94f92e24ff789c91efb53a645a46..0c5343a97d5ef0f97fc6b144dfc82174eacb8573 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,6 +20,7 @@ import six import sys import collections import math +import paddle.fluid as fluid from op_test import OpTest @@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if (len(self.class_pos_count) > 0): + if len(self.class_pos_count) > 0: self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1] + self.class_pos_count = [0, 2, 1, 0] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index e55f8707a9a8ac2b0d69c65b15e6593025511999..47083ca7e954c85bb42fcc88639f3e757283cbea 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -61,9 +61,7 @@ class TestDistTransformer2x2Sync(TestDistBase): def test_transformer(self): download_files() - #Note: loss on test dataset of the first 5 batch are: - # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855 - self.check_with_place("dist_transformer.py", delta=1e-7) + self.check_with_place("dist_transformer.py", delta=1e-5) class TestDistTransformer2x2Async(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index d84dab1499a267ca081c2e8ea2856c7c4bb627cb..3191eb94d753435d31f1849be2d97b1cf89b220c 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -144,6 +144,142 @@ class TestDynRNN(unittest.TestCase): # loss should be small after 100 mini-batch self.assertLess(val[0], loss_0[0]) + # this unit test is just used to the two layer nested dyn_rnn. + def test_train_nested_dyn_rnn(self): + word_dict = [i for i in range(30)] + + def fake_reader(): + seq_len, label = [[2, 2]], [0, 1] + data = [] + for ele in seq_len: + for j in ele: + data.append([numpy.random.randint(30) \ + for _ in range(j)]) + + while True: + yield data, label + + train_data = paddle.batch(fake_reader, batch_size=2) + + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=2) + label = fluid.layers.data( + name='label', shape=[1], dtype='float32', lod_level=1) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + in_ = rnn.step_input(sentence) + sent_emb = fluid.layers.embedding( + input=in_, size=[len(word_dict), 32], dtype='float32') + out_ = fluid.layers.fc(input=sent_emb, size=100, act='tanh') + + rnn1 = fluid.layers.DynamicRNN() + with rnn1.block(): + in_1 = rnn1.step_input(out_) + out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh') + rnn1.output(out_1) + + last = fluid.layers.sequence_last_step(input=rnn1()) + rnn.output(last) + + last = rnn() + logits = fluid.layers.fc(input=last, size=1, act=None) + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(1e-3) + #sgd = fluid.optimizer.Adam(1e-3) + sgd.minimize(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + data = next(train_data()) + val = exe.run(main_program, feed=feeder.feed(data), + fetch_list=[loss])[0] + + for _ in range(100): + val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + print(val) + + # this unit test is just used to the two layer nested dyn_rnn. + def test_train_nested_dyn_rnn2(self): + word_dict = [i for i in range(30)] + + def fake_reader(): + seq_len, label = [[2, 2]], [0, 1] + data = [] + for ele in seq_len: + for j in ele: + data.append([numpy.random.randint(30) \ + for _ in range(j)]) + + while True: + yield data, label + + train_data = paddle.batch(fake_reader, batch_size=2) + hidden_size = 32 + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=2) + label = fluid.layers.data( + name='label', shape=[1], dtype='float32', lod_level=1) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + in_ = rnn.step_input(sentence) + sent_emb = fluid.layers.embedding( + input=in_, + size=[len(word_dict), hidden_size], + dtype='float32') + input_forward_proj = fluid.layers.fc(input=sent_emb, + size=hidden_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, + size=hidden_size * 4, + use_peepholes=False) + + rnn1 = fluid.layers.DynamicRNN() + with rnn1.block(): + in_1 = rnn1.step_input(forward) + out_1 = fluid.layers.fc(input=[in_1], size=100, act='tanh') + rnn1.output(out_1) + + last = fluid.layers.sequence_last_step(input=rnn1()) + rnn.output(last) + + last = rnn() + logits = fluid.layers.fc(input=last, size=1, act=None) + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(1e-3) + #sgd = fluid.optimizer.Adam(1e-3) + sgd.minimize(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + data = next(train_data()) + val = exe.run(main_program, feed=feeder.feed(data), + fetch_list=[loss])[0] + + for _ in range(100): + val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..03471a4432f2b6bf6220e79e99aa506628b1535b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py @@ -0,0 +1,156 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + +MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" + + +def simple_fc_net(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + if use_feed: + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + else: + reader = fluid.layers.open_files( + filenames=[MNIST_RECORDIO_FILE], + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.io.double_buffer(reader) + img, label = fluid.layers.read_file(reader) + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestMNIST(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + MNIST_RECORDIO_FILE, reader, feeder) + + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_fuse_elewise_add_act_ops(self, + model, + use_cuda, + random_data=True): + if use_cuda and not core.is_compiled_with_cuda(): + return + img, label = self._init_data(random_data) + + def _optimizer(learning_rate=1e-6): + optimizer = fluid.optimizer.SGD( + learning_rate=learning_rate, + regularization=fluid.regularizer.L2Decay(1e-6)) + return optimizer + + not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_elewise_add_act_ops=False, + memory_opt=False, + optimizer=_optimizer) + fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_elewise_add_act_ops=True, + memory_opt=False, + optimizer=_optimizer) + + for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_with_fuse_op(self): + self._compare_fuse_elewise_add_act_ops(simple_fc_net, True) + self._compare_fuse_elewise_add_act_ops(simple_fc_net, False) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, True) + self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py index 4a213c29113e5e23af2caf7fbcb807be3d0166d2..3cf8e7229add49ba67abd7d7996a1eb3c4d13846 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py @@ -48,7 +48,7 @@ def create_test_class(test_case, callback, attrs): 'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - if self.attrs["keep_intermediate_value"]: + if self.attrs["save_intermediate_out"]: self.outputs = { 'Out': self.out, "IntermediateOut": self.intermediate_out @@ -73,22 +73,19 @@ def create_test_class(test_case, callback, attrs): def test_check_output(self): self.check_output() + # FIXME(zcd): the intermediate_out_grad is not checked. def test_check_grad_normal(self): - if self.attrs["keep_intermediate_value"]: - self.check_grad( - ['X', 'Y'], ['Out', 'IntermediateOut'], - max_relative_error=0.005, - sum_outputs=['Out']) + if self.attrs["save_intermediate_out"]: + self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005) else: self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005) def test_check_grad_ingore_x(self): - if self.attrs["keep_intermediate_value"]: + if self.attrs["save_intermediate_out"]: self.check_grad( - ['Y'], ['Out', 'IntermediateOut'], + ['Y'], ['Out'], max_relative_error=0.005, - no_grad_set=set("X"), - sum_outputs=['Out']) + no_grad_set=set("X")) else: self.check_grad( ['Y'], ['Out'], @@ -96,12 +93,11 @@ def create_test_class(test_case, callback, attrs): no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.attrs["keep_intermediate_value"]: + if self.attrs["save_intermediate_out"]: self.check_grad( - ['X'], ['Out', 'IntermediateOut'], + ['X'], ['Out'], max_relative_error=0.005, - no_grad_set=set("Y"), - sum_outputs=['Out']) + no_grad_set=set("Y")) else: self.check_grad( ['X'], ['Out'], @@ -303,39 +299,32 @@ for mode in {0, 1}: relu_add_func = partial(relu_add_func, mode=mode) add_relu_func = partial(add_relu_func, mode=mode) - for recomputation in {True, False}: - for keep_intermediate_value in {True, False}: - suffix = ("_keep_intermediate_value" if keep_intermediate_value else "") \ - + ("_recomputation" if recomputation else "") \ - + ("_mode_"+ str(mode)) - create_test_class('scale_add' + suffix, scale_add_func, { - 'scale': scale, - 'functor_list': ["scale", "elementwise_add"], - 'keep_intermediate_value': keep_intermediate_value, - 'recomputation': recomputation - }) - create_test_class('add_scale' + suffix, add_scale_func, { - 'scale': scale, - 'functor_list': ["elementwise_add", "scale"], - 'keep_intermediate_value': keep_intermediate_value, - 'recomputation': recomputation - }) - create_test_class('add_relu' + suffix, add_relu_func, { - 'functor_list': ["elementwise_add", "relu"], - 'keep_intermediate_value': keep_intermediate_value, - 'recomputation': recomputation - }) - create_test_class('relu_add' + suffix, relu_add_func, { - 'functor_list': ["relu", "elementwise_add"], - 'keep_intermediate_value': keep_intermediate_value, - 'recomputation': recomputation - }) - create_test_class('mul_scale' + suffix, mul_scale_func, { - 'scale': scale, - 'functor_list': ["elementwise_mul", "scale"], - 'keep_intermediate_value': keep_intermediate_value, - 'recomputation': recomputation - }) + for save_intermediate_out in {True, False}: + suffix = ("_save_intermediate_out" if save_intermediate_out else "") \ + + ("_mode_"+ str(mode)) + create_test_class('scale_add' + suffix, scale_add_func, { + 'scale': scale, + 'functor_list': ["scale", "elementwise_add"], + 'save_intermediate_out': save_intermediate_out, + }) + create_test_class('add_scale' + suffix, add_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_add", "scale"], + 'save_intermediate_out': save_intermediate_out, + }) + create_test_class('add_relu' + suffix, add_relu_func, { + 'functor_list': ["elementwise_add", "relu"], + 'save_intermediate_out': save_intermediate_out, + }) + create_test_class('relu_add' + suffix, relu_add_func, { + 'functor_list': ["relu", "elementwise_add"], + 'save_intermediate_out': save_intermediate_out, + }) + create_test_class('mul_scale' + suffix, mul_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_mul", "scale"], + 'save_intermediate_out': save_intermediate_out, + }) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 9f6f03f9cfe3c505a7b1227e2b20db3c3c84c745..f61a447fd77d001c1a9c42c46ba9a0d747e926ec 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -30,7 +30,8 @@ def gru( bias, # 1 x 3D is_reverse, act_state, - act_gate): + act_gate, + dtype='float32'): def _seq_to_batch(lod, is_reverse): idx_in_seq_list = [] seq_lens = lod[0] @@ -71,10 +72,10 @@ def gru( T = sum(lod[0]) N = len(lod[0]) D = weight.shape[0] - batch_gate = np.zeros((T, 3 * D), dtype='float64') - batch_reset_hidden_prev = np.zeros((T, D), dtype='float64') - batch_hidden = np.zeros((T, D), dtype='float64') - hidden = np.zeros((T, D), dtype='float64') + batch_gate = np.zeros((T, 3 * D), dtype=dtype) + batch_reset_hidden_prev = np.zeros((T, D), dtype=dtype) + batch_hidden = np.zeros((T, D), dtype=dtype) + hidden = np.zeros((T, D), dtype=dtype) idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse) h_p = h0[sorted_seqs] @@ -108,23 +109,24 @@ class TestGRUOp(OpTest): self.with_bias = True self.act_state = 'tanh' self.act_gate = 'sigmoid' + self.dtype = 'float64' self.set_confs() T = sum(self.lod[0]) N = len(self.lod[0]) - input = np.random.rand(T, 3 * self.D).astype('float64') - weight = np.random.rand(self.D, 3 * self.D).astype('float64') + input = np.random.rand(T, 3 * self.D).astype(self.dtype) + weight = np.random.rand(self.D, 3 * self.D).astype(self.dtype) bias = np.random.rand( - 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( - (1, 3 * self.D), dtype='float64') + 1, 3 * self.D).astype(self.dtype) if self.with_bias else np.zeros( + (1, 3 * self.D), dtype=self.dtype) h0 = np.random.rand( - N, self.D).astype('float64') if self.with_h0 else np.zeros( - (N, self.D), dtype='float64') + N, self.D).astype(self.dtype) if self.with_h0 else np.zeros( + (N, self.D), dtype=self.dtype) batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru( input, self.lod, h0, weight, bias, self.is_reverse, - ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype) self.inputs = {'Input': (input, self.lod), 'Weight': weight} if self.with_bias: @@ -153,6 +155,12 @@ class TestGRUOp(OpTest): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) +class TestGRUOp2(TestGRUOp): + def set_confs(self): + self.D = 19 + self.dtype = 'float32' + + class TestGRUOpNoInitial(TestGRUOp): def set_confs(self): self.with_h0 = False diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b04346b052903959f44aa96f6fccb7d20652e854..9a17d3213c902e0c18123097025349434d271d7f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -382,6 +382,30 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_sequence_scatter(self): + program = Program() + with program_guard(program): + x = layers.data( + name='x', + shape=[3, 6], + append_batch_size=False, + dtype='float32') + idx = layers.data( + name='idx', + shape=[12, 1], + append_batch_size=False, + dtype='int32', + lod_level=1) + updates = layers.data( + name='updates', + shape=[12, 1], + append_batch_size=False, + dtype='float32', + lod_level=1) + out = layers.sequence_scatter(input=x, index=idx, updates=updates) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): @@ -565,6 +589,13 @@ class TestBook(unittest.TestCase): out = layers.cross_entropy(x, label, False, 4) self.assertIsNotNone(out) + def test_expand(self): + program = Program() + with program_guard(program): + x = layers.data(name="input", shape=[10], dtype='int32') + out = layers.expand(x, [1, 2]) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index 67733807f8f8582f68dcfa3f361e13a631a29597..275e5c49d5c298a95b012582a74f8073b800991e 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.optimizer as optimizer from paddle.fluid.framework import Program, program_guard @@ -67,5 +68,34 @@ class TestMemoryTranspiler2(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler3(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + word = fluid.layers.data(name='word', shape=[1], dtype='int64') + emb = [ + fluid.layers.embedding( + word, size=[65536, 256], param_attr='emb') for _ in range(6) + ] + + left = emb.pop(0) + while len(emb) != 0: + right = emb.pop(0) + left = fluid.layers.concat([left, right]) + emb = fluid.layers.mean(left) + fluid.backward.append_backward(emb) + self.program = program + + def test_cascade_reuse(self): + block = self.program.block(0) + # variable reuse in programdesc + # TODO(dzhwinter): confirm cascade strategy. disable temporialy + self.assertTrue("concat_4.tmp_0@GRAD" in block.vars) + # self.assertTrue("concat_3.tmp_0@GRAD" not in block.vars) + # self.assertTrue("concat_2.tmp_0@GRAD" not in block.vars) + # self.assertTrue("concat_1.tmp_0@GRAD" not in block.vars) + # self.assertTrue("concat_0.tmp_0@GRAD" not in block.vars) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index 4fae11e928dc7e066799a8936bada0e252afaa42..8835b6995e00756bcfd3385f362c292924d98128 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -19,6 +19,7 @@ import unittest import paddle.fluid as fluid import paddle import paddle.dataset.mnist as mnist +from paddle.fluid.layers.io import open_recordio_file class TestMultipleReader(unittest.TestCase): @@ -41,7 +42,7 @@ class TestMultipleReader(unittest.TestCase): def test_main(self): with fluid.program_guard(fluid.Program(), fluid.Program()): - data_file = fluid.layers.open_recordio_file( + data_file = open_recordio_file( filename='./mnist.recordio', shapes=[(-1, 784), (-1, 1)], lod_levels=[0, 0], diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index cac132e6e08a8a9ec595236b1a990c0900ea4f0f..4153394c1da776d0a41e1415a09fa7d6f4b14d6d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -26,7 +26,7 @@ main_program = default_startup_program() class TestOperator(unittest.TestCase): def test_error_type(self): - block = main_program.create_block() + block = main_program._create_block() try: block.append_op() self.assertFail() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index d7b9af8bac67ef89cc1ae59ccf002c2c488f3436..380e17284421b8b6986e6a808b87dd243e058938 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -18,6 +18,7 @@ import unittest import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import paddle.fluid.profiler as profiler import numpy import six @@ -120,7 +121,7 @@ class BaseParallelForTest(unittest.TestCase): thread_num = fluid.core.get_cuda_device_count( ) if use_gpu else 8 places = get_places(thread_num) - pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) + pd = ParallelDo(places, use_nccl=use_nccl) data = next(generator) if isinstance(data, fluid.framework.Variable): diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py index 98e609b76982650c9d18f87c3c0637056cc40899..0f0bdfc44a7bec7cdf1af22e2dd291de23293fc8 100644 --- a/python/paddle/fluid/tests/unittests/test_preprocessor.py +++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py @@ -20,6 +20,7 @@ import numpy as np import paddle import paddle.fluid as fluid import paddle.dataset.mnist as mnist +from paddle.fluid.layers.io import open_recordio_file class TestPreprocessor(unittest.TestCase): @@ -43,7 +44,7 @@ class TestPreprocessor(unittest.TestCase): img_expected_res = [] lbl_expected_res = [] with fluid.program_guard(fluid.Program(), fluid.Program()): - data_file = fluid.layers.io.open_recordio_file( + data_file = open_recordio_file( './mnist_for_preprocessor_test.recordio', shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], @@ -64,7 +65,7 @@ class TestPreprocessor(unittest.TestCase): img_actual_res = [] lbl_actual_res = [] with fluid.program_guard(fluid.Program(), fluid.Program()): - data_file = fluid.layers.io.open_recordio_file( + data_file = open_recordio_file( './mnist_for_preprocessor_test.recordio', shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index 0997afc97a97333c914a3027103ec48733b410dc..cb1d94809b4ba99fa9077f99b93689504415b71d 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -28,25 +28,25 @@ class TestProgram(unittest.TestCase): self.assertEqual(-1, b.parent_idx) self.assertEqual(0, b.idx) - b = main_program.create_block() + b = main_program._create_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = main_program.create_block() + b = main_program._create_block() self.assertEqual(2, b.idx) self.assertEqual(1, b.parent_idx) - main_program.rollback() + main_program._rollback() b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = main_program.create_block() + b = main_program._create_block() self.assertEqual(3, b.idx) self.assertEqual(1, b.parent_idx) - main_program.rollback() + main_program._rollback() b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) @@ -120,9 +120,9 @@ class TestProgram(unittest.TestCase): main_program = fluid.Program() with fluid.program_guard(main_program, startup_program): net() - no_read_program = main_program.inference_optimize() - keep_read_program = main_program.inference_optimize( - export_for_deployment=False) + no_read_program = main_program._inference_optimize() + keep_read_program = main_program._inference_optimize( + prune_read_op=False) no_read_ops = no_read_program.global_block().ops keep_read_ops = keep_read_program.global_block().ops self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2) diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index c5210bb2085bc386df43cd0d20292d7b308a1093..f5009556adc8951aad80532d77cac4b920887c66 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -19,6 +19,7 @@ import unittest import paddle.fluid as fluid import paddle import paddle.dataset.mnist as mnist +from paddle.fluid.layers.io import open_recordio_file class TestRecordIO(unittest.TestCase): @@ -40,7 +41,7 @@ class TestRecordIO(unittest.TestCase): def test_main(self, decorator_callback=None): # use new program with fluid.program_guard(fluid.Program(), fluid.Program()): - data_file = fluid.layers.open_recordio_file( + data_file = open_recordio_file( './mnist.recordio', shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 0557593657e2e480a509902a07f25723b2c710b0..7691221a5511ce83176ff07c231873f66ca371ed 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -79,7 +79,7 @@ class TestReshapeOpWithInputShape(OpTest): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(["X"], "Out", sum_outputs=["Out"]) + self.check_grad(["X"], "Out") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py deleted file mode 100644 index 9d1d139721ad7ee3e29d44c9b3e7c666b78a4556..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -import sys -from op_test import OpTest - - -def to_abs_offset_lod(lod): - offset_lod = [[0] for i in lod] - for i, level in enumerate(lod): - for seq_len in level: - offset_lod[i].append(offset_lod[i][-1] + seq_len) - - if len(offset_lod) == 0 or len(offset_lod) == 1: - return offset_lod - import copy - new_offset_lod = copy.deepcopy(offset_lod) - for idx, val in enumerate(offset_lod[0]): - new_offset_lod[0][idx] = offset_lod[1][val] - return new_offset_lod - - -def seq_concat(inputs, level): - lod0 = inputs['X'][0][1][1] - lod1 = inputs['X'][1][1][1] - x0 = inputs['X'][0][1][0] - x1 = inputs['X'][1][1][0] - level_idx = len(lod0) - level - 1 - outs = [] - for i in range(len(lod0[level_idx])): - sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod( - lod0)[level_idx][i + 1], :] - sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod( - lod1)[level_idx][i + 1], :] - outs.append(np.concatenate((sub_x0, sub_x1), axis=0)) - return np.concatenate(outs, axis=0) - - -class TestSeqConcatOp(OpTest): - def set_data(self): - # two level, batch size is 3 - x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[2, 2], [1, 1, 1, 1]] - x1 = np.random.random((4, 8, 3)).astype('float32') - lod1 = [[2, 2], [1, 1, 1, 1]] - axis = 1 - level = 1 - self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} - self.attrs = {'axis': axis, 'level': level} - self.outputs = {'Out': (np.concatenate([x0, x1], axis=1), lod0)} - - def setUp(self): - self.op_type = "sequence_concat" - self.set_data() - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['x0'], 'Out') - - -class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp): - def set_data(self): - # two level, batch size is 3 - x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[2, 2], [1, 1, 1, 1]] - x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[2, 2], [1, 2, 2, 2]] - axis = 0 - level = 0 - self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} - self.attrs = {'axis': axis, 'level': level} - out_lod = [[2, 2], [2, 3, 3, 3]] - self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} - - -class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp): - def set_data(self): - # two level, batch size is 3 - x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[2, 2], [1, 1, 1, 1]] - x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[3, 1], [1, 2, 2, 2]] - axis = 0 - level = 1 - self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} - self.attrs = {'axis': axis, 'level': level} - out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]] - self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} - - -class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp): - def set_data(self): - # two level, batch size is 3 - x0 = np.random.random((4, 3, 4)).astype('float32') - lod0 = [[1, 1, 1, 1]] - x1 = np.random.random((7, 3, 4)).astype('float32') - lod1 = [[1, 2, 2, 2]] - axis = 0 - level = 0 - self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} - self.attrs = {'axis': axis, 'level': level} - out_lod = [[2, 3, 3, 3]] - self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 66e77714c5d65d51262f76519901032182985ea8..641eb03a5fbf1bb140b20cc3518cea83386fa577 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -31,11 +31,11 @@ class TestSeqAvgPool(OpTest): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [11, 23]).astype('float32') - lod = [[4, 1, 3, 3]] + lod = [[11]] self.inputs = {'X': (x, lod)} offset = self.convert_to_offset(lod) - out = np.zeros((4, 23)).astype('float32') + out = np.zeros((len(lod[0]), 23)).astype('float32') self.outputs = {'Out': out} return x, offset, out @@ -71,7 +71,7 @@ class TestSeqMaxPool(TestSeqAvgPool): def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') - lod = [[4, 1, 3, 5]] + lod = [[13]] offset = self.convert_to_offset(lod) for i in range(len(offset[0]) - 1): l = offset[0][i + 1] - offset[0][i] @@ -79,7 +79,7 @@ class TestSeqMaxPool(TestSeqAvgPool): self.inputs = {'X': (x, lod)} - out = np.zeros((4, 23)).astype('float32') + out = np.zeros((1, 23)).astype('float32') self.outputs = {'Out': out} return x, offset, out diff --git a/python/paddle/fluid/tests/unittests/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/test_sequence_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..db99001cecc95fb4c684dacbd379bb88c8aec9fc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_concat.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSequenceConcat(OpTest): + def setUp(self): + x1 = np.random.random(size=(10, 80)) + lod1 = [7, 3] + x2 = np.random.random(size=(20, 80)) + lod2 = [12, 8] + + out = np.concatenate((x1[0:lod1[0]], x2[0:lod2[0]], x1[lod1[0]:], + x2[lod2[0]:])) + out_lod = [19, 11] + + self.op_type = "sequence_concat" + self.inputs = {'X': [("x1", (x1, [lod1])), ("x2", (x2, [lod2]))]} + self.outputs = {"Out": (out, [out_lod])} + + def test_output(self): + self.check_output(1e-3) + + def test_dx(self): + self.check_grad(inputs_to_check=['x1', 'x2'], output_names="Out") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py new file mode 100644 index 0000000000000000000000000000000000000000..4ac97f7ed49fa7e6537efad134ab1320639dce9d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand_as.py @@ -0,0 +1,77 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSequenceExpandAs(OpTest): + def setUp(self): + self.op_type = 'sequence_expand_as' + self.set_data() + self.compute() + + def set_data(self): + x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[1, 3, 4]] + self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} + + def compute(self): + x = self.inputs['X'] + x_data, x_lod = x if type(x) == tuple else (x, None) + y_data, y_lod = self.inputs['Y'] + + assert len(y_lod) == 1 and len(y_lod[0]) == x_data.shape[0] + + repeats = [] + for i in range(len(y_lod[0])): + repeat_num = y_lod[0][i] + if repeat_num == 0: + continue + repeats.extend([i for _ in range(repeat_num)]) + + out_data = x_data[repeats] + self.outputs = {'Out': (out_data, y_lod)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequenceExpandAsCase1(TestSequenceExpandAs): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + x_lod = [[2, 3]] + y_data = np.random.uniform(0.1, 1, [10, 1]).astype('float32') + y_lod = [[2, 2, 0, 3, 3]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + +class TestSequenceExpandAsCase2(TestSequenceExpandAs): + def set_data(self): + x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') + x_lod = [[1]] + y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') + y_lod = [[2]] + self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py index 471515c817541976a06eb024fa3d4f77b78f920d..3067294e5bb3edcb2f1ce77f5e60b885a39a6475 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py @@ -62,7 +62,8 @@ class TestSequencePadOp(OpTest): start_idx = end_idx out_data = np.array(padded_sequences) - self.outputs = {'Out': out_data} + length = np.array(self.x_len_lod[0]).reshape((-1, 1)) + self.outputs = {'Out': out_data, 'Length': length} def setUp(self): self.op_type = 'sequence_pad' @@ -129,3 +130,7 @@ class TestSequencePadOp7(TestSequencePadOp): self.pad_value = [1.0] self.padded_length = 7 self.dtype = 'float32' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py b/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f3d239e9c798745cbb3dda9df56dbd717aab74ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSequenceScatterOp(OpTest): + def setUp(self): + self.op_type = "sequence_scatter" + + X_data = np.random.uniform(0.1, 1.0, [3, 6]).astype('float32') + Ids_data = np.array([[0], [1], [2], [5], [4], [3], [2], [1], [3], [2], + [5], [4]]).astype('int64') + Ids_lod = [[3, 5, 4]] + Updates_data = np.random.uniform(0.1, 1.0, [12, 1]).astype('float32') + Updates_lod = Ids_lod + + Out_data = np.copy(X_data) + Out_data[0][Ids_data[0:3]] += Updates_data[0:3] + Out_data[1][Ids_data[3:8]] += Updates_data[3:8] + Out_data[2][Ids_data[8:]] += Updates_data[8:] + + self.inputs = { + 'X': X_data, + 'Ids': (Ids_data, Ids_lod), + 'Updates': (Updates_data, Updates_lod) + } + self.outputs = {'Out': Out_data} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Updates'], 'Out', in_place=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index c30da2389d50d3b6bdf1f911aaed6ed71f274153..bbcabb751f0761705ff268c4408dc8673bb01b81 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -34,7 +34,7 @@ class TestTransposeOp(OpTest): self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(['X'], 'Out', sum_outputs=['Out']) + self.check_grad(['X'], 'Out') def initTestCase(self): self.shape = (3, 4) diff --git a/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py new file mode 100644 index 0000000000000000000000000000000000000000..4abeae77d26e8def85596aefc6c2f89cd4e4d6f0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor + + +class TestTrunctedGaussianRandomOp(unittest.TestCase): + def setUp(self): + self.op_type = "truncated_gaussian_random" + self.inputs = {} + self.attrs = { + "shape": [10000], + "mean": .0, + "std": 1., + "seed": 10, + } + + self.outputs = ["Out"] + + def test_cpu(self): + self.gaussian_random_test(place=fluid.CPUPlace()) + + def test_gpu(self): + if core.is_compiled_with_cuda(): + self.gaussian_random_test(place=fluid.CUDAPlace(0)) + + def gaussian_random_test(self, place): + + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + tensor = outs[0] + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) + self.assertAlmostEqual(numpy.var(tensor), 0.773, delta=0.1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index f0e74aff6bdfa7d9f0a7f10e64cac4de88009f0a..ab7a18d4c5c4ce1e490e2951ff9fbb023324e753 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -19,6 +19,7 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers +from paddle.fluid.layers.io import open_recordio_file pos_enc_param_names = ( "src_pos_enc_table", @@ -405,7 +406,7 @@ def transformer( src_pad_idx, trg_pad_idx, pos_pad_idx, ): - file_obj = fluid.layers.open_recordio_file( + file_obj = open_recordio_file( filename='/tmp/wmt16.recordio', shapes=[ [batch_size * max_length, 1], diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 8429e2fd7c5141f064c66d8f406889bca1510fe2..28c7ae5341b20f0f79da8cf682d279fc4cc3fa19 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -20,6 +20,6 @@ from .memory_optimization_transpiler import memory_optimize, release_memory from .ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", - "release_memory", "HashName", "RoundRobin", "DistributeTranspilerConfig" + "DistributeTranspiler", "memory_optimize", "release_memory", "HashName", + "RoundRobin", "DistributeTranspilerConfig" ] diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 53c9cbe23dd82af866658fe46d1d631b0a3b26f3..f58f1883a407a3123856e19b5ec8fc01862466a7 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -247,7 +247,7 @@ class DistributeTranspiler(object): np.random.seed(self.origin_program.random_seed) np.random.shuffle(grad_var_mapping_items) - grad_name_to_send_dummy_out = dict() + self.grad_name_to_send_dummy_out = dict() for grad_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) @@ -271,7 +271,7 @@ class DistributeTranspiler(object): dummy_output = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - grad_name_to_send_dummy_out[grad_varname] = dummy_output + self.grad_name_to_send_dummy_out[grad_varname] = dummy_output # get send op_role_var, if not splited, the grad should have .trainer suffix # if splited, grad should be the original grad var name (split_by_ref and send @@ -297,7 +297,12 @@ class DistributeTranspiler(object): if self.sync_mode: send_barrier_out = program.global_block().create_var( name=framework.generate_control_dev_var_name()) - input_deps = grad_name_to_send_dummy_out.values() + if self.has_distributed_lookup_table: + self.grad_name_to_send_dummy_out[ + self.table_name] = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + input_deps = self.grad_name_to_send_dummy_out.values() + program.global_block().append_op( type="send_barrier", inputs={"X": list(input_deps)}, @@ -329,7 +334,7 @@ class DistributeTranspiler(object): recv_dep_in = send_barrier_out else: # connect deps to send op in async mode - recv_dep_in = grad_name_to_send_dummy_out[ + recv_dep_in = self.grad_name_to_send_dummy_out[ self.param_name_to_grad_name[param_varname]] all_recv_outputs.extend(splited_var) # get recv op_role_var, if not splited, the grad should have .trainer suffix @@ -575,7 +580,7 @@ class DistributeTranspiler(object): assert isinstance(origin_block, Block) # we put the new sub block to new block to follow the block # hierarchy of the original blocks - new_sub_block = program.create_block(lr_block.idx) + new_sub_block = program._create_block(lr_block.idx) # clone vars for var in origin_block.vars: @@ -595,7 +600,7 @@ class DistributeTranspiler(object): # record optimize blocks and we can run them on pserver parallel optimize_blocks = [] if len(lr_ops) > 0: - lr_decay_block = pserver_program.create_block( + lr_decay_block = pserver_program._create_block( pserver_program.num_blocks - 1) optimize_blocks.append(lr_decay_block) for _, op in enumerate(lr_ops): @@ -608,7 +613,7 @@ class DistributeTranspiler(object): grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): - per_opt_block = pserver_program.create_block(pre_block_idx) + per_opt_block = pserver_program._create_block(pre_block_idx) optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay # cases may like: @@ -631,7 +636,7 @@ class DistributeTranspiler(object): grad_to_block_id = list(set(grad_to_block_id)) # append global ops if global_ops: - opt_state_block = pserver_program.create_block( + opt_state_block = pserver_program._create_block( pserver_program.num_blocks - 1) optimize_blocks.append(opt_state_block) for glb_op in global_ops: @@ -1046,9 +1051,13 @@ class DistributeTranspiler(object): index=op_index + 2, type="send", inputs={'X': self.trainer_side_table_grad_list}, - outputs={'Out': []}, + outputs={ + 'Out': + [self.grad_name_to_send_dummy_out[self.table_name]] + if self.sync_mode else [] + }, attrs={ - "sync_mode": True, + "sync_mode": False, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ @@ -1064,7 +1073,7 @@ class DistributeTranspiler(object): table_var = pserver_program.global_block().vars[self.table_name] prefetch_var_name_to_block_id = [] for index in range(len(self.all_prefetch_input_vars)): - prefetch_block = pserver_program.create_block(optimize_block.idx) + prefetch_block = pserver_program._create_block(optimize_block.idx) trainer_ids = self.all_prefetch_input_vars[index][pserver_index] pserver_ids = pserver_program.global_block().create_var( name=trainer_ids.name, @@ -1122,7 +1131,7 @@ class DistributeTranspiler(object): if 'Param' in op.input_names and op.input("Param")[0] == self.table_name ][0] - table_opt_block = pserver_program.create_block(pre_block_idx) + table_opt_block = pserver_program._create_block(pre_block_idx) if self.sync_mode: # create grad vars in pserver program @@ -1185,7 +1194,7 @@ class DistributeTranspiler(object): persistable=True, type=core.VarDesc.VarType.RAW) - checkpoint_save_block = pserver_program.create_block(pre_block_idx) + checkpoint_save_block = pserver_program._create_block(pre_block_idx) # this 'file_path' do not be used in save lookup table variable checkpoint_save_block.append_op( type='save', diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py old mode 100644 new mode 100755 index 3e58e125de4188144646236f7999c620cd8e9459..d4517059a4b033eec20ef6903894426ccbd597d7 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -56,6 +56,7 @@ class ControlFlowGraph(object): self._live_in = defaultdict(set) self._live_out = defaultdict(set) self._skip_opt = skip_opt + self.pool = [] def _add_connections(self, connections): """Populates _successors and _presuccessors for two neighbor nodes.""" @@ -77,6 +78,7 @@ class ControlFlowGraph(object): for i in range(self.op_size): self._uses[i].update(self._ops[i].input_arg_names()) self._defs[i].update(self._ops[i].output_arg_names()) + self._live_in[i] = self._uses[i] def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -88,39 +90,39 @@ class ControlFlowGraph(object): self._defs[i].add(new_name) if old_name in self._live_in[i]: self._live_in[i].remove(old_name) - self._live_out[i].add(new_name) + self._live_in[i].add(new_name) if old_name in self._live_out[i]: self._live_out[i].remove(old_name) self._live_out[i].add(new_name) - def _reach_fixed_point(self, live_in, live_out): - """Check if the liveness set has stablized.""" - if len(live_in) != len(self._live_in): - return False - if len(live_out) != len(self._live_out): - return False - for i in range(self.op_size): - if (live_in[i] != self._live_in[i] or - live_out[i] != self._live_out[i]): - return False - return True - def _dataflow_analyze(self): self._build_graph() live_in = defaultdict(set) - live_out = defaultdict(set) - # Repeatedly apply liveness updates until the algorithm stablize - # on a complete set live input vars and live output vars. - while True: - for i in reversed(list(range(self.op_size))): - live_in[i] = set(self._live_in[i]) - live_out[i] = set(self._live_out[i]) - for s in self._successors[i]: - self._live_out[i] |= self._live_in[s] - self._live_in[i] = self._uses[i] | ( - self._live_out[i] - self._defs[i]) - if self._reach_fixed_point(live_in, live_out): - break + worklist = list(range(len(self._ops) - 1, -1, -1)) + while worklist: + i = worklist.pop(0) + live_in[i] = set(self._live_in[i]) + for s in self._successors[i]: + self._live_out[i] |= self._live_in[s] + self._live_in[i] = self._uses[i] | ( + self._live_out[i] - self._defs[i]) + if live_in[i] != self._live_in[i]: + for d in self._presuccessors[i]: + worklist.append(d) + + def _fill_pool(self, i, is_forward): + block_desc = self._ops[i].block() + in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) + can_optimize = [ + x for x in in_diff + if self._check_var_validity(block_desc, x, is_forward) + ] + if can_optimize: + for var_name in can_optimize: + cache = (var_name, self._find_var(block_desc, var_name, + is_forward).shape()) + if cache not in self.pool: + self.pool.append(cache) def _get_diff(self, a, b): u = a & b @@ -211,7 +213,6 @@ class ControlFlowGraph(object): # update skip set to meet users' demand if skip_opt_set: self._skip_opt.update(skip_opt_set) - self.pool = [] for i in range(self.op_size): op = self._ops[i] if op.type() in SUB_BLOCK_OPS: @@ -234,16 +235,24 @@ class ControlFlowGraph(object): for index, cache_pair in enumerate(self.pool): cache_var = cache_pair[0] cache_shape = cache_pair[1] - if not compare_shape(x_shape, cache_shape, level): - continue - if not self._has_var(block_desc, cache_var, is_forward): + if PRINT_LOG: + print("cache %s not exists!" % + (cpt.to_text(cache_var))) continue + if x == cache_var: + if PRINT_LOG: + print("x : ", cpt.to_text(x), " cache : ", + cpt.to_text(cache_var), " is same var!") + break x_dtype = self._find_var(block_desc, x, is_forward).dtype() cache_dtype = self._find_var(block_desc, cache_var, is_forward).dtype() + + if not compare_shape(x_shape, cache_shape, level): + continue # TODO(qijun): actually, we should compare # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype] if x_dtype != cache_dtype: @@ -256,8 +265,6 @@ class ControlFlowGraph(object): "var shape is %s ") % (index, x, cache_var, str(cache_shape))) self.pool.pop(index) - if x == cache_var: - break # Rename the var to the cache var already with # memory allocated in order to reuse the memory. _rename_arg_(self._ops, x, cache_var, begin_idx=i) @@ -266,16 +273,7 @@ class ControlFlowGraph(object): is_forward) self._update_graph(x, cache_var, begin_idx=i) break - - in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) - can_optimize = [ - x for x in in_diff - if self._check_var_validity(block_desc, x, is_forward) - ] - if can_optimize: - for var_name in can_optimize: - self.pool.append((var_name, self._find_var( - block_desc, var_name, is_forward).shape())) + self._fill_pool(i, is_forward) def _process_sub_block_pair(pdesc, sub_block_pair): @@ -357,7 +355,7 @@ def _get_cfgs(input_program): :return: A list of ControlFlowGraph, each corresponds to a block. """ ops_list = [] - pdesc = input_program.get_desc() + pdesc = input_program._get_desc() block_desc = pdesc.block(0) op_size = block_desc.op_size() @@ -383,10 +381,13 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): Note: it doesn't not support subblock nested in subblock. - :param input_program: Input Program - :param print_log: whether to print debug log. - :param level: If level=0, reuse if the shape is completely equal, o - :return: + Args: + input_program(str): Input Program + skip_opt_set(set): vars wil be skipped in memory optimze + print_log(bool): whether to print debug log. + level(int): If level=0, reuse if the shape is completely equal, o + Returns: + None """ if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") @@ -407,6 +408,9 @@ def release_memory(input_program, skip_opt_set=None): Args: input_program(Program): The program will be inserted :code:`delete_op`. + skip_opt_set(set): vars wil be skipped in memory optimze + Returns: + None """ cfgs = _get_cfgs(input_program) for cfg in cfgs: diff --git a/python/requirements.txt b/python/requirements.txt index f8298a63612cb217ce0e711e78fffdf86b73313d..84cf440397b994ba12fa70d9e316e788f34e2415 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,7 +2,7 @@ requests==2.9.2 numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version protobuf==3.1 recordio>=0.1.0 -matplotlib +matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib rarfile scipy>=0.19.0 Pillow