diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 83fe9af768964003130d02b7d913ad1c2102dd1d..59661c9c1da53a2ddac0127ed1827fedde811a1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,6 +31,3 @@ - id: go-fmt types: - go - - id: gometalinter - types: - - go diff --git a/CMakeLists.txt b/CMakeLists.txt index 4921226ec1c90a969fa1cfc383823820500c7757..1252e7539816016dfdf1b90b8941fa42e6bb85e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,14 @@ if(ANDROID OR IOS) "Disable MKLDNN when cross-compiling for Android and iOS" FORCE) set(WITH_MKLML OFF CACHE STRING "Disable MKLML package when cross-compiling for Android and iOS" FORCE) + + # Compile PaddlePaddle mobile inference library + if (NOT WITH_C_API) + set(WITH_C_API ON CACHE STRING + "Always compile the C_API when cross-compiling for Android and iOS" FORCE) + endif() + set(MOBILE_INFERENCE ON) + add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -97,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + ######################################################################################## include(external/mklml) # download mklml package @@ -160,9 +174,11 @@ endif(USE_NNPACK) add_subdirectory(proto) -# "add_subdirectory(go)" should be placed after the following loine, -# because it depends on paddle/optimizer. -add_subdirectory(paddle/optimizer) +if(NOT MOBILE_INFERENCE) + # "add_subdirectory(go)" should be placed after the following loine, + # because it depends on paddle/optimizer. + add_subdirectory(paddle/optimizer) +endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # placed after this block, because they depends on it. diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 51c3b918cc4ef4cf6c8052ccc14028a872309fcf..db8f5ab0456792f903093b9cf20e2541f00add5c 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,10 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) +endif(WITH_TESTING) + if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) @@ -49,11 +53,12 @@ if(NOT WITH_GOLANG) endif(NOT WITH_GOLANG) if(NOT WITH_GPU) - add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() + add_definitions(-DPADDLE_WITH_CUDA) + FIND_PACKAGE(CUDA REQUIRED) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f7483f6be9169eb58f0148cd3a956a8c881e1fe3..bd853d921b4362ac7ac5e17e629552b2a200f08a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -8,7 +8,7 @@ ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "master" + GIT_TAG 4e79cb69b9425f5f8c3a84be4350d4ab75b5fd9d PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 957f8271e4841836956b0c3f2cf3d8c88a31192a..c819eb4d70898e48eab499c666168d78262d4240 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -36,6 +36,7 @@ ExternalProject_Add( # change this back to the official Github repo once my PR is # merged. 
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" + GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -45,11 +46,11 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b3fef738ccc0b5886bb0a32501bb7b7adade0ff1..08bdc1e1623b0d917061c7368e9b2a8f7e9517fd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -31,6 +31,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_TAG v0.3.5 PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -43,12 +44,12 @@ ExternalProject_Add( -DWITH_GFLAGS=ON -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 6a2a79b7631b32e8a099797de509af64533bbb95..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -56,11 +56,11 @@ IF(WITH_TESTING) -DBUILD_GMOCK=ON -Dgtest_disable_pthreads=ON -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7cf7ba85cca4c248dcc74e078124c0b3815ee380..be7f6a9465970711170bd15dcecaadeaa8a55f86 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index bb258c7b5581fc22b44f4fe15c119f8081f4767e..8bd058222880b4df3b08da09c02f9fe7f1d0ee66 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -35,6 +35,7 @@ ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY 
"https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -48,9 +49,9 @@ ExternalProject_Add( -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON -DBUILD_SHARED=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c496a52b780364f3014f8fa3dfbc944a7aa7430e..e2c9fe56f335ae5b627b4d8d4bb17e4a2a466677 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -42,11 +42,11 @@ ExternalProject_Add( -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ff9868fc4e0d970b11e4763d2e0c8581f4f85907..c311783aa3187678c31c27ddbbd074790ca444f3 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -389,13 +389,60 @@ function(go_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endfunction(go_test) +# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support +# Usage: +# paddle_protobuf_generate_cpp( ) + +function(paddle_protobuf_generate_cpp SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + + if (MOBILE_INFERENCE) + set(EXTRA_FLAG "lite:") + else() + set(EXTRA_FLAG "") + endif() + + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${_protobuf_protoc_src}") + list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") + + add_custom_command( + OUTPUT "${_protobuf_protoc_src}" + "${_protobuf_protoc_hdr}" + + COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + + function(proto_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(proto_srcs) set(proto_hdrs) - protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() diff --git 
a/cmake/util.cmake b/cmake/util.cmake index d1aee3e170a2d143ac06b438725e907e96f041c8..117ab7f49cdf4a568cd203b2b17767643d0b2d50 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -73,25 +73,43 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - paddle_optimizer - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) + if(MOBILE_INFERENCE) + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + else() + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + paddle_optimizer + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(ANDROID) target_link_libraries(${TARGET_NAME} log) diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..cf146dc088e3905a751ff55c26fd82ef0ba02c89 100644 --- a/doc/api/v1/index_cn.rst +++ b/doc/api/v1/index_cn.rst @@ -21,7 +21,7 @@ Model Config API trainer_config_helpers/optimizers.rst trainer_config_helpers/data_sources.rst trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst + trainer_config_helpers/activations.rst trainer_config_helpers/poolings.rst trainer_config_helpers/networks.rst trainer_config_helpers/evaluators.rst diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index c94627a72806fa2eca77c79da24f7f3ca18f0259..d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -345,6 +345,11 @@ clip .. autoclass:: paddle.v2.layer.clip :noindex: +resize +------ +.. autoclass:: paddle.v2.layer.resize + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept diff --git a/doc/design/block.md b/doc/design/block.md index be8800122035984df281692fc40009c397565046..7cbf0d55b1faeb2093ee7cf234d1c2ad1905885b 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -5,12 +5,12 @@ Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: - Caffe, Torch, and Paddle: sequences of layers. -- TensorFlow, Caffe2, Mxnet: graphs of operators. +- TensorFlow, Caffe2, Mxnet: graph of operators. - PaddlePaddle: nested blocks, like C++ and Java programs. ## Block in Programming Languages and Deep Learning -In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators. +In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. 
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: @@ -24,14 +24,14 @@ A key difference is that a C++ program describes a one pass computation, whereas ## Stack Frames and the Scope Hierarchy -The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other: +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: -| programming languages | PaddlePaddle | -|-----------------------|-------------------------------| -| stack | scope hierarchy | -| stack frame | scope | -| push at entering block| push at entering block | -| pop at leaving block | destroy at minibatch completes| +| programming languages | PaddlePaddle | +|-----------------------|---------------------------------| +| stack | scope hierarchy | +| stack frame | scope | +| push at entering block| push at entering block | +| pop at leaving block | destroy when minibatch completes| 1. In traditional programs: @@ -42,9 +42,9 @@ The existence of the backward makes the execution of a block of traditional prog 1. In PaddlePaddle - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. - - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. - The height of the highest tree is the maximum depth of nested blocks. - - After the process of a minibatch, PaddlePaddle destroys the scope hierarchy. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. ## Use Blocks in C++ and PaddlePaddle Programs @@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples. The following C++ programs shows how blocks are used with the `if-else` structure: ```c++ +namespace pd = paddle; + int x = 10; -int y = 20; -int out; +int y = 1; +int z = 10; bool cond = false; +int o1, o2; if (cond) { int z = x + y; - out = softmax(z); + o1 = z; + o2 = pd::layer::softmax(z); } else { - int z = fc(x); - out = z; + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; } + ``` An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: @@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator ```python import paddle as pd -x = var(10) -y = var(20) -cond = var(false) -ie = pd.create_ifelseop(inputs=[x], output_num=1) +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() with ie.true_block(): - x = ie.inputs(true, 0) - z = operator.add(x, y) - ie.set_output(true, 0, operator.softmax(z)) + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) with ie.false_block(): - x = ie.inputs(false, 0) - z = layer.fc(x) - ie.set_output(true, 0, operator.softmax(z)) -out = b(cond) + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) ``` -In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`. +In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . 
+ +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. -A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values. ### Blocks with `for` and `RNNOp` -The following RNN model from the [RNN design doc](./rnn.md) +The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) : ```python -x = sequence([10, 20, 30]) -m = var(0) -W = tensor() -U = tensor() - -rnn = create_rnn(inputs=[input]) -with rnn.stepnet() as net: - x = net.set_inputs(0) - h = net.add_memory(init=m) - fc_out = pd.matmul(W, x) - hidden_out = pd.matmul(U, h.pre(n=1)) - sum = pd.add_two(fc_out, hidden_out) - act = pd.sigmoid(sum) - h.update(act) # update memory with act - net.set_outputs(0, act, hidden_out) # two outputs - +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) o1, o2 = rnn() -print o1, o2 ``` - has its equivalent C++ program as follows ```c++ int* x = {10, 20, 30}; -int m = 0; -int W = some_value(); -int U = some_other_value(); +int* m = {0}; +int* W = {0.314}; +int* U = {0.375}; int mem[sizeof(x) / sizeof(x[0]) + 1]; int o1[sizeof(x) / sizeof(x[0]) + 1]; @@ -131,25 +135,21 @@ int o2[sizeof(x) / sizeof(x[0]) + 1]; for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { int x = x[i-1]; if (i == 1) mem[0] = m; - int fc_out = W * x; - int hidden_out = Y * mem[i-1]; - int sum = fc_out + hidden_out; + int a = W * x; + int b = Y * mem[i-1]; + int s = fc_out + hidden_out; int act = sigmoid(sum); mem[i] = act; o1[i] = act; o2[i] = hidden_out; } - -print_array(o1); -print_array(o2); ``` - ## Compilation and Execution -Like TensorFlow programs, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference. +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. -The generation of this protobuf message is like what a compiler generates a binary executable file. The execution of the message that the OS executes the binary file. +The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. 
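To make this two-phase picture concrete, here is a minimal sketch of the intended usage. The names (`pd.layer.*`, `pd.executor()`, `pd.default_program()`, `pd.scope()`) follow the design docs in this PR and are assumptions, not a released API:

```python
import paddle as pd  # hypothetical front-end module used throughout the design docs

# "Compilation": each call below only appends VarDesc/OpDesc entries to a
# ProgramDesc protobuf message; no tensor computation happens yet.
x = pd.layer.data("images")
l = pd.layer.data("label")
y = pd.layer.fc(x)
cost = pd.layer.mse(y, l)
pd.optimize(cost)

# "Execution": an executor interprets the message, much like an OS runs a binary.
# Per the executor design doc, it takes the ProgramDesc, an entrance block id,
# and a Scope that holds the variable instances.
exe = pd.executor()
exe.run(pd.default_program(), block_id=0, scope=pd.scope())
```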
## The "Binary Executable File Format" @@ -186,8 +186,8 @@ Also, the RNN operator in above example is serialized into a protobuf message of ``` OpDesc { - inputs = {0} // the index of x - outputs = {5, 3} // indices of act and hidden_out + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above attrs { "memories" : {1} // the index of h "step_net" : @@ -203,32 +203,32 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope. -Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example: ```python -a = pd.Varaible(shape=[20, 20]) +a = pd.Variable(shape=[20, 20]) b = pd.fc(a, params=["fc.w", "fc.b"]) rnn = pd.create_rnn() -with rnn.stepnet() as net: - x = net.set_inputs(a) +with rnn.stepnet(): + x = a.as_step_input() # reuse fc's parameter fc_without_b = pd.get_variable("fc.w") - net.set_outputs(fc_without_b) + rnn.output(fc_without_b) out = rnn() ``` -the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. +The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. -`SymbolTable` can do the following stuff: +`SymbolTable` can do the following: - store the definitions (some names and attributes) of variables and operators, -- to verify if a variable was declared, -- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). ```c++ @@ -240,19 +240,18 @@ class SymbolTable { OpDesc* NewOp(const string& name=""); - // TODO determine whether name is generated by python or C++ - // currently assume that a unique name will be generated by C++ if the - // argument name left default. - VarDesc* NewVar(const string& name=""); + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); - // find a VarDesc by name, if recursive true, find parent's SymbolTable + // find a VarDesc by name, if recursive is true, find parent's SymbolTable // recursively. // this interface is introduced to support InferShape, find protobuf messages // of variables and operators, pass pointers into InferShape. 
- // operator // // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should - // be proposed and embedded into pybind to enable python operate on C++ pointers. + // be proposed and embedded into pybind to enable python operation on C++ pointers. VarDesc* FindVar(const string& name, bool recursive=true); OpDesc* FindOp(const string& name); @@ -270,7 +269,7 @@ class SymbolTable { After all the description of variables and operators is added into SymbolTable, the block has enough information to run. -The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions. +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. ```c++ @@ -302,7 +301,7 @@ public: void CreateVariables(const framework::Scope& scope); void CreateOperators(); - // some other necessary interfaces of NetOp are list below + // some other necessary interfaces of NetOp are listed below // ... private: @@ -316,15 +315,14 @@ private: Block inherits from OperatorBase, which has a Run method. Block's Run method will run its operators sequentially. -There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block, -after `Run`, `Eval` will get the latest value and return the targets. +There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. The definition of Eval is as follows: ```c++ // clean a block description by targets using the corresponding dependency graph. // return a new BlockDesc with minimal number of operators. -// NOTE not return a Block but the block's description so that this can be distributed +// NOTE: The return type is not a Block but the block's description so that this can be distributed // to a cluster. BlockDesc Prune(const BlockDesc& desc, vector targets); diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/design/dcgan.png differ diff --git a/doc/design/executor.md b/doc/design/executor.md new file mode 100644 index 0000000000000000000000000000000000000000..b5fb6c5c3c1da3c112ce63878322083dd5c42b70 --- /dev/null +++ b/doc/design/executor.md @@ -0,0 +1,23 @@ +# Executor Design Doc + +## Motivation + +We use executor to do the runtime evaluation of a `ProgramDesc`. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs. + +### What does executor do? + +It evaluates all the operators in the `block_id`th block of a `ProgramDesc`. + +### What does executor NOT do? + +It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run. + +It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices. + +## Implementation + +`Executor` evaluates a `ProgramDesc`. 
Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc) diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md new file mode 100644 index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55 --- /dev/null +++ b/doc/design/gan_api.md @@ -0,0 +1,253 @@ +# Design for GAN + +GAN (Generative Adversarial Networks [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas. + +It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. + +In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. + +

+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+ +The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563. + +

+Figure 2. Photo borrowed from the original DC-GAN paper.
+ +## The Conditional-GAN might be a class. +In this design, we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structures: + +- DCGAN(object): which contains everything required to build a GAN model. It provides the following member functions as its API: + +- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of the discriminator and generator as well. + +- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen. +Returns a generated image. + +- discriminator(image): +Given an image, decide if it is from a real source or a fake one. +Returns a 0/1 binary label. + +- build_model(self): +Build the whole GAN model and define the training losses for both the generator and the discriminator. + +## Discussion on Engine Functions required to build GAN +- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can't be trained correctly.) +- Different optimizers are responsible for optimizing different losses. + +To be more detailed, we introduce our design of DCGAN as follows: + +### Class member Function: Initializer +- Set up hyper-parameters, including the conditional dimension, noise dimension, batch size and so forth. +- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G. +```python +class DCGAN(object): + def __init__(self, y_dim=None, z_dim=100): + + # hyper parameters + self.y_dim = y_dim # conditional gan or not + self.batch_size = 100 + self.z_dim = z_dim # input noise dimension + + # define parameters of discriminators + self.D_W0 = pd.Variable(shape=[3, 3, 1, 128], data=pd.gaussian_normal_randomizer()) + self.D_b0 = pd.Variable(np.zeros(128)) # variables also support initialization from numpy data + self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.D_b1 = pd.Variable(np.zeros(128)) # variables also support initialization from numpy data + self.D_W2 = pd.Variable(np.random.rand(128, 1)) + self.D_b2 = pd.Variable(np.zeros(128)) + self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2] + + # define parameters of generators + self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b0 = pd.Variable(np.zeros(128)) # variables also support initialization from numpy data + self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b1 = pd.Variable(np.zeros(128)) # variables also support initialization from numpy data + self.G_W2 = pd.Variable(np.random.rand(128, 1)) + self.G_b2 = pd.Variable(np.zeros(128)) + self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2] +``` + +### Class member Function: Generator +- Given a noisy input z, returns a fake image. +- Concatenation, batch-norm, FC operations required; +- Deconv layer required, which is missing now...
+```python +class DCGAN(object): + def generator(self, z, y = None): + # input z: the random noise + # input y: input data label (optional) + # output G_im: generated fake images + + if not self.y_dim: + z = pd.layer.concat(1, [z, y]) + + G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) + G_h0_bn = pd.layer.batch_norm(G_h0) + G_h0_relu = pd.layer.relu(G_h0_bn) + + G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) + G_h1_bn = pd.layer.batch_norm(G_h1) + G_h1_relu = pd.layer.relu(G_h1_bn) + + G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) + G_im = pd.layer.tanh(G_im) + return G_im +``` + +### Class member function: Discriminator +- Given a noisy input z, returns a fake image. +- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; +```python +class DCGAN(object): + def discriminator(self, image): + # input image: either generated images or real ones + # output D_h2: binary logit of the label + + D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) + D_h0_bn = pd.layer.batchnorm(h0) + D_h0_relu = pd.layer.lrelu(h0_bn) + + D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) + D_h1_bn = pd.layer.batchnorm(D_h1) + D_h1_relu = pd.layer.lrelu(D_h1_bn) + + D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) + return D_h2 +``` + +### Class member function: Build the model +- Define data readers as placeholders to hold the data; +- Build generator and discriminators; +- Define two training losses for discriminator and generator, respectively. +If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self): + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_t = self.discriminator(self.images) + # generated fake images + self.sampled = self.sampler(self.z, self.y) + self.D_f = self.discriminator(self.G) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_t = self.discriminator(self.images) + # generate fake images + self.sampled = self.sampler(self.z) + self.D_f = self.discriminator(self.images) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake + + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) +``` + +If we do not have dependency engine but blocks, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self, default_block): + # input data in the default block + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + with pd.default_block().g_block(): + 
if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_g = self.discriminator(self.G, self.y) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_g = self.discriminator(self.G, self.y) + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie)) + + with pd.default_block().d_block(): + if self.y_dim: # if conditional GAN, includes label + self.D_t = self.discriminator(self.images, self.y) + self.D_f = self.discriminator(self.G, self.y) + else: # original version of GAN + self.D_t = self.discriminator(self.images) + self.D_f = self.discriminator(self.G) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake +``` +Some small confusion and problems with this design: +- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph. +- Requires ability to create a block anytime, rather than in if-else or rnn only; + +## Main function for the demo: +Generally, the user of GAN just need to the following things: +- Define an object as DCGAN class; +- Build the DCGAN model; +- Specify two optimizers for two different losses with respect to different parameters. +```python +# pd for short, should be more concise. +from paddle.v2 as pd +import numpy as np +import logging + +if __name__ == "__main__": + # dcgan class in the default graph/block + # if we use dependency engine as tensorflow + # the codes, will be slightly different like: + # dcgan = DCGAN() + # dcgan.build_model() + with pd.block() as def_block: + dcgan = DCGAN() + dcgan.build_model(def_block) + + # load mnist data + data_X, data_y = self.load_mnist() + + # Two subgraphs required!!! + with pd.block().d_block(): + d_optim = pd.train.Adam(lr = .001, beta= .1) + d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D) + with pd.block.g_block(): + g_optim = pd.train.Adam(lr = .001, beta= .1) + g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G) + + # executor + sess = pd.executor() + + # training + for epoch in xrange(10000): + for batch_id in range(N / batch_size): + idx = ... + # sample a batch + batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size] + # sample z + batch_z = np.random.uniform(-1., 1., [batch_size, z_dim]) + + if batch_id % 2 == 0: + sess.run(d_step, + feed_dict = {dcgan.images: batch_im, + dcgan.y: batch_label, + dcgan.z: batch_z}) + else: + sess.run(g_step, + feed_dict = {dcgan.z: batch_z}) +``` + +# More thinking about dependency engine v.s. block design: +- What if we just want to run an intermediate result? Do we need to run the whole block/graph? +- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage? diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md index 954a19c0733358c235eae3cffe134c23dac94c95..26d140f06db4ecefa86be015eaa731ffddc6910c 100644 --- a/doc/design/if_else_op.md +++ b/doc/design/if_else_op.md @@ -1,41 +1,51 @@ -IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. 
If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack(). +# The `IfElse` Operator -```python -import paddle as pd +PaddlePaddle's `IfElse` operator differs from TensorFlow's: -x = var() -y = var() -cond = var() -default_value = var() -b = pd.create_ifelseop(inputs=[x], output_num=1) -with b.true_block(): - x = b.inputs(0) - z = operator.add(x, y) - b.set_output(0, operator.softmax(z)) - -with b.false_block(): - x = b.inputs(0) - z = layer.fc(x) - b.set_output(0, operator.softmax(z)) - -out = b(cond) -``` +- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas +- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. + +## Example + +The following PaddlePaddle program shows the usage of the IfElse operator: -If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as: ```python import paddle as pd -x = var() -y = var() -cond = var() -default_value = var() -b = pd.create_ifelseop(inputs=[x], output_num=1, default_value) - -with b.true_block(): - x = b.inputs(0) - z = operator.add(x, y) - b.set_output(0, operator.softmax(z)) +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` -out = b(cond) +A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. + +An equivalent C++ program is as follows: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int d = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} ``` -where default_value is a list of vars for `cond` == False. 
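To make the per-instance split-and-merge semantics above concrete, here is a small NumPy sketch of what the `IfElse` operator conceptually does with a mini-batch (an illustration of the semantics only, not the PaddlePaddle implementation):

```python
import numpy as np

x = np.array([[10.0], [20.0], [30.0]])   # mini-batch input, shape=[3, 1]
cond = np.array([False, True, True])     # one boolean per instance, e.g. x > 15

# Split: instances with cond == True go to the true branch, the rest to the false branch.
x_true, x_false = x[cond], x[~cond]

# Each branch runs its own block on its sub-batch; these stand in for the real branch programs.
out_true = x_true + 1.0
out_false = x_false * 2.0

# Merge: scatter the branch outputs back into the original instance order.
out = np.empty_like(x)
out[cond] = out_true
out[~cond] = out_false
print(out)   # [[20.], [21.], [31.]]
```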
diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot index 8d1b673abf6b78c851676fa379dc850c4818f0e5..e115f9844bae6ad24f638c8ed4749cea8aff06a9 100644 --- a/doc/design/images/graph_construction_example.dot +++ b/doc/design/images/graph_construction_example.dot @@ -33,7 +33,6 @@ digraph ImageClassificationGraph { cost -> MSE_Grad [color=red]; d_cost -> MSE_Grad [color=red]; - x -> MSE_Grad [color=red]; l -> MSE_Grad [color=red]; y -> MSE_Grad -> d_y [color=red]; diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png index 181187503472d15779b87284105841168b3945c4..261611a5721f9aa97874f7e6d897fe48cf667db2 100644 Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png index 3049a9315fd616464dec54e33064cb75598ca536..4c69687f4a6a181138f3df72ce5e8aa48487b5be 100644 Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png index 25d19088cbf0b5f68cf734f2ff21eba8af4a2860..e668c16e0cac73acb4e5dc2b1827557ae77126b4 100644 Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/design/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. + +For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. + +## Proposed Solution + +The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and will write the output variable type and store them in block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. 
+ +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +} +``` + +Operator developers can write the specialize `VarTypeInferer` as follow. + +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +} +``` + +Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. + +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..17440fae5028cfac5d58fc079ca2096d0be3a0f9 --- /dev/null +++ b/doc/design/optimizer.md @@ -0,0 +1,105 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. + + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) 
+ ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_backward_pass(self, loss, parameter_list=None): + """ + create and add gradient Operators in BlockDesc to Compute gradients of `loss` + for parameters in parameter_list + + Args: + loss: an variable generated by cost function. + parameter_list: parameters that need to compute gradient and update to optimize the lost. + + Returns: + list of (parameters, gradients) pair. + """ + return None + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `create_backward_pass()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/design/program.md b/doc/design/program.md index fb8f86ac07af403c9fee015f2a3adbfaa3c6d631..bd2456787c4e336d357a65255a8274a7c9e465cc 100644 --- a/doc/design/program.md +++ b/doc/design/program.md @@ -1,8 +1,10 @@ -# Design Doc: ProgramDesc +# Design Doc: PaddlePaddle Programs -The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. -As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program +A simple example PaddlePaddle program can be found in [graph.md](./graph.md): ```python x = layer.data("images") @@ -13,36 +15,112 @@ optimize(cost) train(cost, reader=mnist.train()) ``` -generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message: +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. -```protobuf -message ProgramDesc { - repeated BlockDesc blocks = 1; +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. + +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. 
For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; } +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf message BlockDesc { required int32 parent = 1; repeated VarDesc vars = 2; repeated OpDesc ops = 3; } +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf message OpDesc { AttrDesc attrs = 1; ... } +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: +``` message AttrDesc { - required AttrType type = 1; + required string name = 1; - // index into ProgramDesc::blocks when type==BLOCK - optional int32 block = 2; + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK ... } ``` -When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions. This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks. This requires that we can trace the parent of a block. - -A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp. In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`. So that `AttrDesc::block` could be an integer block ID. +## InferShape With this design, the InferShape function should take the following parameters: diff --git a/doc/design/python_api.md b/doc/design/python_api.md new file mode 100644 index 0000000000000000000000000000000000000000..cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c --- /dev/null +++ b/doc/design/python_api.md @@ -0,0 +1,284 @@ +# Design Doc: Python API + +Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. 
+ +| Python classes | Protobuf messages | +| --- | --- | +| Program | ProgramDesc | +| Block | BlockDesc | +| Operator | OpDesc | +| Variable | VarDesc | + +Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. + +## Core Concepts + +### Program + +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. + +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. + +```python +class Program(objects): + def __init__(self): + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. + self.blocks = vector() + self.blocks.append(Block(self, -1)) # the global block + self.current_block = 0 # initialized to the global block + + def global_block(): + return self.blocks[0] + + def current_block(): + return self.get_block(self.current_block) + + def rollback(): + self.current_block = self.current_block().parent_idx + + def create_block(): + new_block_idx = len(self.block) + self.blocks.append(Block(self, self.current_block)) + self.current_block = new_block_idx + return current_block() +``` + +`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. + +`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. + +### Block + +A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes + +1. a map from variable names to an instance of the Python `Variable` class, and +1. a list of `Operator` instances. + +```python +class Block(objects): + def __init__(self, program, parent_idx): + self.desc = core.NewBlock(program.desc) + self.program = program + self.vars = map() + self.ops = vector() + self.parent_idx = parent_idx + + def create_var(self, ...): + return Variable(self, ...) + + def _create_global_var(self, ...): + program.global_block().create_var(...) + + def create_parameter(self, name, ...): + # Parameter is a subclass of variable. See Parameter section for details. + self.vars[name] = Parameter(self._create_global_var(...), ...) + return self.vars[name] + + def append_operator(self, ...): + self.ops.append(Operator(self, ...)) + + def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + self.ops.prepend(Operator(self, ...)) +``` + +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. + +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. 
+ +### Operator + +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. + +```python +class Operator(object): + def __init__(self, + block, # Block + type, # string + inputs, # dict + outputs,# dict + attrs # dict + ): + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) + + def type(self): + return self.desc.type() +``` + +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. + +### Variable + +Operators take Variables as its inputs and outputs. + +```python +class Variable(object): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + ): + if name is None: + name = unique_name_generator() + self.name = name + self.block = block + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) + self.writer = None +``` + +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. + +### Parameter + +A parameter is a global variable with an initializer (or load) operator. + +```python +class Parameter(Variable): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + trainable, # bool + initialize_op_attrs, + optimize_op_attrs): + super(Parameter, self).__init__(block, name, shape, dtype, lod_level) + self.trainable = trainable + self.optimize_op_attrs = optimize_op_attrs + block.prepend(Operator(block, # Block + initialize_op_attrs['type'], # string + None, # no inputs + self, # output is the parameter + initialize_op_attrs) +``` + +When users create a parameter, they can call + +```python +program.create_parameter( + ..., + init_attr={ + type: "uniform_random", + min: -1.0, + max: 1.0, + }) +) +``` + +In above example, `init_attr.type` names an initialize operator. It can also name the load operator + +```python +init_attr={ + type: "load", + filename: "something.numpy", +} +``` + +`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. + +## Layer Function + +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. + +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. 
and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out +``` + +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: + +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. 
The example code of `add_activation` are: + +```python +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp +``` + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md new file mode 100644 index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8 --- /dev/null +++ b/doc/design/refactor/session.md @@ -0,0 +1,180 @@ +# Design Doc: Session + +## Abstract + +The *session* object encapsulates the environment in which the +computation graph is executed. + +We will have the *local* session and *remote* session, they offer the +same [interface](#interface). The local session encapsulates the local +runtime environment and the remote session encapsulates the cluster +runtime environment. + +The local runtime environment contains: + +1. computation devices (i.e., CPU, GPU) handles, and +1. the [scope](../scope.md) which holds all variables. + +The remote runtime environment contains: + +1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster, + and +1. the distributed [scope](../scope.md) in a cluster which holds all + variables. + +The user can create a remote session on Paddle Cloud and evaluate the +computation graph with it. In this way, the user can control the +remote computation resource in a cluster from his local computer. + + +## Background + +The current design has an implicit global session in which +`paddle.eval()` is executed. The pain point is: + +Since the user is not able to explicitly switch between runtime +environments, the user cannot run a topology in two independent +environments. + +For example, in reinforcement learning, the user may want to have a +stale model for inference and a fresh model for training, and only +replace the stale model with the fresh model periodically. + +Furthermore, we have no concept that encapsulates a remote environment +that executes a computation graph. + +We need the session object to address above issues. + + +## Session + +A session is an object that owns the runtime environment. All +computations are executed through `session.eval()`. + + +### Interface + +```python +eval( + targets, + feed_dict=None, +) +``` + +Evaluates the target Operations or Variables in `targets`. + +- *targets*: the evaluation targets. Can be a single Operation or + Variable, or a list with the Operations or Variables as + elements. The value returned by `eval()` has the same shape as the + `target` argument. + + The PaddlePaddle program is represented by + the [ProgramDesc](../design/program.md), `eval()` will infer the + ProgramDesc from the given targets and run the PaddlePaddle + program. Please + see + [this graph](./distributed_architecture.md#local-training-architecture) for + the detailed illustration for the local session + and + [this graph](./distributed_architecture.md#distributed-training-architecture) for + the detailed illustration for the remote session. + +- *feed_dict*: a dictionary that contains the tensors which override + the edges of the computation graph. 
+ + feed_dict not only can provide the input data, it can override any + OP's input as well: + + ```python + a = pd.constant(2.0, name="a") + b = pd.variable(name="b") + c = pd.mul(a,b) + sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0 + ``` + +```python +close() +``` + +Closes the session and releases the scope that the session owns. + + +### Create a Local Session + +```python +session( + devices=None +) +``` + +Creates a new session. One session owns one global scope, so creating +multiple sessions will create different scopes. + +- *devices*: a single `string` or a list of `string` of device names, + the corresponding devices will be the computation devices for + `eval()`. If not specified, all available devices (e.g., all GPUs) + will be used. The user doesn't need to specify the CPU device since + it will be always used. Multiple sessions can use the same device. + + +#### Example + +```Python +a = paddle.constant(1.0) +b = paddle.constant(2.0) +c = a + b +sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"]) +sess.eval(c) +sess.close() +``` + +### Create a Remote Session + +```python +create_cloud_job( + name, + num_trainer, + mem_per_trainer, + gpu_per_trainer, + cpu_per_trainer, + num_ps, + mem_per_ps, + cpu_per_ps, +) +``` + +Creates a Paddle Cloud job. Fails if the job name exists. + +```python +get_cloud_job( + name +) +``` + +Gets a Paddle Cloud job. + +```python +remote_session( + job +) +``` + +- *job*: the Paddle Cloud job. + +#### Example + +```Python +reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud +image = reader.column(0) +label = reader.column(1) +fc1 = paddle.op.fc(image, size=256, act="sigmoid") +fc2 = paddle.op.fc(fc1, size=10, act="softmax") +cost = paddle.op.cross_entropy(fc2, label) +opt = paddle.optimizer.sgd(cost) + +job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1) +sess = paddle.remote_ession(job) +for i in range(1000): + sess.eval(opt) +sess.close() +``` diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md index a2a353c28374213605be0996fcff75ad13d736f1..bf240225046e1512e240502b1e16a2c5ea5c0d83 100644 --- a/doc/design/refactorization.md +++ b/doc/design/refactorization.md @@ -17,22 +17,22 @@ The goals of refactoring include: 1. A graph is composed of *variables* and *operators*. -1. The description of graphs must be capable of being serialized/deserialized, so that: +1. The description of graphs must be serializable/deserializable, so that: - 1. It can to be sent to the cloud for distributed execution, and + 1. It can be sent to the cloud for distributed execution, and 1. It can be sent to clients for mobile or enterprise deployment. -1. The Python program does the following steps +1. The Python program does two things - 1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to + 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to 1. the C++ library `libpaddle.so` for local execution, 1. the master process of a distributed training job for training, or 1. the server process of a Kubernetes serving job for distributed serving. - 1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. + 1. 
*Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. ## Description and Realization of Computation Graph -At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph. +At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. At runtime, the C++ program realizes the graph and runs it. @@ -42,11 +42,11 @@ At runtime, the C++ program realizes the graph and runs it. |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)| |Block|BlockDesc|Block| -The word *graph* is interchangeable with *block* in this document. A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). +The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). ## Compilation and Execution -1. Run an application Python program to describe the graph. In particular, the Python application program does the following: +1. Run a Python program to describe the graph. In particular, the Python application program does the following: 1. Create `VarDesc` to represent local/intermediate variables, 1. Create operators and set attributes, @@ -54,10 +54,10 @@ The word *graph* is interchangeable with *block* in this document. A graph repr 1. Infer the type and the shape of variables, 1. Plan memory-reuse for variables, 1. Generate the backward graph - 1. Optimize the computation graph. - 1. Potentially, split the graph for distributed training. + 1. Add optimization operators to the computation graph. + 1. Optionally, split the graph for distributed training. -1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following: +1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block, 1. realize local variables defined in the BlockDesc message in the new scope, @@ -107,8 +107,8 @@ Compile Time -> IR -> Runtime ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) * `Operator` is the fundamental building block of the user interface. - * Operator stores input/output variable names, and attributes. - * The `InferShape` interface is used to infer the shape of the output variable shapes based on the shapes of the input variables. + * Operator stores input/output variable names and attributes. + * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. 
* Use `Run` to compute the `output` variables from the `input` variables. --- @@ -139,7 +139,7 @@ Compile Time -> IR -> Runtime * Limit the number of `tensor.device(dev) = ` in your code. * `thrust::transform` and `std::transform`. * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. - * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`. + * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. * Hand-writing `GPUKernel` and `CPU` code * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) --- @@ -182,10 +182,12 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) 1. Write an Op class and its gradient Op class, if required. 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. 3. Invoke the macro `REGISTER_OP`. This macro will - 1. Call maker class to complete the `proto` and the `checker` + 1. Call maker class to complete `proto` and `checker` 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` +4. Invoke the `USE` macro in which the Op is used to make sure that it is linked. + --- # Backward Module (1/2) ### Create Backward Operator @@ -195,13 +197,14 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) --- # Backward Module (2/2) ### Build Backward Network -- **Input**: graph of forward operators -- **Output**: graph of backward operators +- **Input**: a graph of forward operators +- **Output**: a graph of backward operators - **Corner cases in construction** - Shared Variables => insert an `Add` operator to combine gradients - No Gradient => insert a `fill_zero_grad` operator - Recursive NetOp => call `Backward` recursively - RNN Op => recursively call `Backward` on stepnet + - RNN Op => recursively call `Backward` on stepnet --- @@ -211,10 +214,10 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) * Only dims and data pointers are stored in `Tensor`. * All operations on `Tensor` are written in `Operator` or global functions. * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) -* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`. +* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. * `step_scopes` in RNN is a variable and not a tensor. -* `Scope` is where variables are stores. - * map +* `Scope` is where variables are stored. + * map * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. --- @@ -242,7 +245,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) --- # Control the migration quality - Compare the performance of migrated models with old ones. -- Follow the google C++ style +- Follow the google C++ style guide. - Build the automatic workflow of generating Python/C++ documentations. - The documentation of layers and ops should be written inside the code. - Take the documentation quality into account when submitting pull requests. 
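+To make the operator registration steps listed above concrete, here is a minimal, hedged sketch; the `CosineOp` example and the exact base-class and constructor signatures are illustrative and may differ from the real code base:
+
+```cpp
+// cosine_op.cc (illustrative)
+class CosineOp : public framework::OperatorBase {
+  // step 1: implement Run(), which computes Out = cos(X)
+};
+
+class CosineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CosineOpMaker(framework::OpProto* proto, framework::OpAttrChecker* checker)
+      : OpProtoAndCheckerMaker(proto, checker) {
+    AddInput("X", "the input of the cosine operator");      // step 2: describe
+    AddOutput("Out", "the output of the cosine operator");  // inputs, outputs
+    AddComment("The cosine operator: Out = cos(X)");        // and attributes
+  }
+};
+
+// step 3: complete the proto and the checker and add the op to OpInfoMap
+REGISTER_OP_WITHOUT_GRADIENT(cos, CosineOp, CosineOpMaker);
+
+// step 4: in the compilation unit that uses the operator, make sure it is linked
+USE_OP(cos);
+```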
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..9f1ce4bae7b393cb9f04909e5e4917b8d660771c
--- /dev/null
+++ b/doc/design/register_grad_op.md
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. Some operators' gradient computation requires more than one gradient operator. For example, the gradient of *minus* consists of two operators -- an identity operator and a scale operator. So we need to make the registration mechanism support the mapping from an operator to a set of operators for gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an association map whose key is the operator type. The `grad_op_type_` field indicates the associated gradient operator type, and an operator can create its gradient operator through the `OpInfo::creator_` of that gradient operator. The pseudo code is
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<std::string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDesc>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper of the protobuf message `OpDesc` that allows fast manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_` is `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind&);
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forward operator.
+
+We should change the registration macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` takes an `OpProtoAndCheckerMaker` and a `GradOpDescMaker` as well, we just list them in the same macro. This can be done with a macro that uses `__VA_ARGS__`.
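+One way such a variadic macro could be structured is sketched below; `OperatorRegistrar` and `FillOpInfo` are illustrative names used only for this sketch, not existing APIs:
+
+```cpp
+// Sketch: forward however many classes are listed to a variadic registrar.
+template <typename... Classes>
+struct OperatorRegistrar {
+  explicit OperatorRegistrar(const char* op_type) {
+    OpInfo info;
+    // Inspect each class in Classes... (operator, proto-and-checker maker,
+    // grad-op desc maker) and fill the corresponding OpInfo field.
+    FillOpInfo<Classes...>(op_type, &info);
+    OpInfoMap.emplace(op_type, info);
+  }
+};
+
+#define REGISTER_OPERATOR(op_type, ...) \
+  static OperatorRegistrar<__VA_ARGS__> __op_registrar_##op_type##__(#op_type)
+```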
+ +The user interface should be + +```cpp +vector MinusOpGradMaker(OpDesc) {...} +REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker); +// Developers can still manually implement gradient operator. +REGISTER_OPERATOR(minus_grad, MinusGradOp); +``` + +The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside. + +```cpp +REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp); +``` diff --git a/doc/design/scope.md b/doc/design/scope.md index b1f9bb4378eb5ec6926f1e53f7c1f4fd5674064c..4da76eebb74abcd26ec2b8671399e6bc4fb58574 100644 --- a/doc/design/scope.md +++ b/doc/design/scope.md @@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`. ```cpp class Scope { public: - Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); const Variable* FindVar(const std::string& name) const; private: @@ -98,7 +98,7 @@ class Scope { Variable* FindVar(const std::string& name) const; // return if already contains same name variable. - Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); private: std::shared_ptr parent_; @@ -107,7 +107,7 @@ class Scope { ``` ## Only scope can create a variable -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. ## When scope destroyed, all variables inside this scope should be destroyed together @@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar ## Orthogonal interface -`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md new file mode 100644 index 0000000000000000000000000000000000000000..9e6f3b20cbcdc55e481fbe7bf5fa555d8b3c3d45 --- /dev/null +++ b/doc/design/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a kind of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in that tensor. It is straightforward to represent the sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` shows the first dimension of `SelectedRows`. The `rows` are the indices of which rows of `SelectedRows` are non-zeros. The `value_` field is an N-dim tensor and shape is `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. 
+ +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: + +``` +x = SelectedRow { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a kind of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time since the `rows_` and `value_` are related to training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. + +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, `InferShape` method will inference output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +There are several operators should be written to support `SelectedRows`. They are: + +1. Operators which generates `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md index a0419ec002159893b035fae1300fce489e68936a..37e4f7b90f94fa3eb015e733999cd84c96b2239c 100644 --- a/doc/design/tensor_array.md +++ b/doc/design/tensor_array.md @@ -1,39 +1,250 @@ # Design for TensorArray +This design doc presents the necessity of a new C++ class `TensorArray`. +In addition to the very simple C++ implementation + +```c++ +class TensorArray { + public: + explicit TensorArray(const LoDTensor&); + explicit TensorArray(size_t size); + + private: + vector values_; +}; +``` + +We also need to expose it to PaddlePaddle's Python API, +because users would want to use it with our very flexible operators `WhileLoop`. +An example for a RNN based on dynamic operators is + +```python +input = pd.data(...) +num_steps = Var(12) + +TensorArray states(size=num_steps) +TensorArray step_inputs(unstack_from=input) +TensorArray step_outputs(size=num_steps) + +W = Tensor(...) +U = Tensor(...) 
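+# Note: U is applied to the previous state and W to the current step input in
+# the cell update below; their shapes are assumed to be compatible (e.g.
+# [state_dim, state_dim] and [state_dim, input_dim] respectively).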
+default_state = some_op() + +step = Var(1) + +wloop = paddle.create_whileloop(loop_vars=[step]) +with wloop.frame(): + wloop.break_if(pd.equal(step, num_steps) + pre_state = states.read(step-1, default_state) + step_input = step_inputs.read(step) + state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) + states.write(step, state) + step_outputs.write(step, state) # output state + step.update(state+1) + +output = step_outputs.stack() +``` + +## Background +Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. + +An RNN can be implemented with the following pseudocode + +```c++ +Array states; +Array input_segments; +Array output_segments; +Parameter W, U; + +step = 1 +seq_len = 12 +while_loop { + if (step == seq_len) break; + states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); + output_segments[step] = states[step] // take state as output + step++; +} +``` +According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. + +Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. + + +Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). +Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. + +As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. + +The implementation is similar to `recurrent_op`. +The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** + + +Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, +the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. + +## Why `TensorArray` +The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. + +The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. + +So there should be an array-like container, which can store the segments of a tensor or LoD tensor. + +**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . +This is where the notion of `TensorArray` comes from. + +## Introduce TensorArray to uniform all the three RNNs TensorArray as a new concept is borrowed from TensorFlow, it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, -such as `RecurrentGradientMachine`. +such as `recurrent_op`, `RecurrentGradientMachine`. In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), `TensorArray` is used to segment inputs and store states in all time steps. 
By providing some methods similar to a C++ array, -the definition of some state-based dynamic models such as RNN could be more natural and highly flexible. - -## Dynamic-Related Methods -Some basic methods should be proposed as follows: - -### stack() -Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`. -### unstack(axis=0) -Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. -### concat() -Return the values in the `TensorArray` as a concatenated Tensor. -### write(index, value, data_shared=true) -Write value into index of the TensorArray. -### read(index) -Read the value at location `index` in the `TensorArray`. -### size() -Return the number of values. +the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. + +## Dynamic-operations on TensorArray + +`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented + +```python +# several helper operators for TensorArray +def tensor_array_stack(ta, tensor): + ''' + get a tensor array `ta`, return a packed `tensor`. + ''' + pass + +def tensor_array_unstack(tensor, ta): + ''' + get a `tensor`, unstack it and get a tensor array `ta`. + ''' + pass + +def tensor_array_write(ta, index, tensor, data_shared): + ''' + get a `tensor` and a scalar tensor `index`, write `tensor` into index-th + value of the tensor array `ta`. + `data_shared` is an attribute that specifies whether to copy or reference the tensors. + ''' + pass + +def tensor_array_read(ta, index, tensor): + ''' + get a tensor array `ta`, a scalar tensor `index`, read the index-th value of + `ta` and return as the `tensor`. + ''' + pass + +def tensor_array_size(ta, tensor): + ''' + get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. + ''' + pass +``` + +It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, +for example + +```python +class TensorArray: + def __init__(self, name): + self.name = name + self.desc = TensorArrayDesc() + + def stack(self, name=None): + ''' + Pack the values in a `TensorArray` into a tensor with rank one higher + than each tensor in `values`. + `stack` can be used to split tensor into time steps for RNN or whileloop. + + @name: str + the name of the variable to output. + ''' + tensor = Var(name) + tensor_array_stack(self.name, tensor) + return tensor + + def unstack(self, input): + ''' + Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. + `unstack` can be used to concatenate all the time steps for RNN or whileloop. + + @input: str + the name of input tensor + ''' + tensor_array_unstack(tensor, self.name) + + def write(self, index, value, data_shared=True): + ''' + Write value into index of the TensorArray. + If `data_shared` is set to True, than the index-th value in TensorArray will + be shared with the tensor passed in. + + @index: str + name of a scalar tensor + @value: str + name of a tensor + @data_shared: bool + ''' + tensor_array_write(self.name, index, value, data_shared) + + def read(self, index, output): + ''' + Read the value at location `index` in the `TensorArray`. + + @index: str + name of a scalar tensor + @output: + name of a output variable + ''' + tensor_array_read(self.name, index, output) + + + def size(self, output): + ''' + Return the number of values. 
+ + @output: str + name of a scalar tensor + ''' + tensor_array_size(self.name, output) +``` ## LoDTensor-related Supports -The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input, -because each step of RNN could only take a tensor-represented batch of data as input, +The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. + +Since each step of RNN can only take a tensor-represented batch of data as input, some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. -Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`. +Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, +these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. + +Some definitions are like + +```python +def unpack(level): + ''' + Split LodTensor in some `level` and generate batches, if set `sort_by_length`, + will sort by length. -With these two methods, a variant-sentence-RNN can be implemented like + Returns: + - a new `TensorArray`, whose values are LodTensors and represents batches + of data. + - an int32 Tensor, which stores the map from the new batch's indices to + original LoDTensor + ''' + pass + +def pack(level, indices_map): + ''' + Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` + and `level` and `indices_map`. + ''' + pass +``` + +With these two methods, a varience-length sentence supported RNN can be implemented like ```c++ // input is the varient-length data @@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map); ``` the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. - - -some details are as follows. - -### unpack(level, sort_by_length) -Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length. - -Returns: - -- a new `TensorArray`, whose values are LodTensors and represents batches of data. -- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor -### pack(level, indices_map) -Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`. 
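+As a usage sketch (hypothetical; the method placement on `TensorArray` and the argument names mirror the C++ snippet above rather than an existing API, and the return convention follows the `unpack` docstring), the proposed `unpack`/`pack` pair could drive a variable-length RNN from Python like this:
+
+```python
+ta = TensorArray("rnn_steps")
+
+# split the LoD tensor `sentence_input` at LoD level 1 into per-step batches;
+# `indices_map` records how to restore the original sequence order later
+step_inputs, indices_map = ta.unpack(sentence_input, level=1, sort_by_length=True)
+
+states = TensorArray("rnn_states")
+# ... run the while-loop over `step_inputs`, writing each new state into `states` ...
+
+# reassemble the per-step results into a LoD tensor in the original order
+rnn_output = states.pack(level=1, indices_map=indices_map)
+```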
diff --git a/doc/design/test.dot b/doc/design/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/design/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/design/test.dot.png differ diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md index bfbbdd0578ebc69ea4b49ade9b041573a9e9ad55..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f 100644 --- a/doc/design/var_desc.md +++ b/doc/design/var_desc.md @@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep ## Definition of VarDesc -A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it. +A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. ```proto message VarDesc { required string name = 1; - optional LoDTensorDesc lod_tensor = 2; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; } ``` -## Definition of LodTensorDesc +## Definition of TensorDesc ```proto enum DataType { @@ -38,87 +45,25 @@ enum DataType { FP64 = 6; } -message LoDTensorDesc { +message TensorDesc { required DataType data_type = 1; - repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - optional int32 lod_level = 3 [default=0]; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] } ``` -## Definition of Variable in Python - -In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable. - -```python -image = Variable(dims=[-1, 640, 480]) -# fc1 and fc2 are both Variable -fc1 = layer.fc(input=image, output_size=10) -fc2 = layer.fc(input=fc1, output_size=20) -``` -### what should class `Variable` Have -1. `name`.a name of string type is used to mark the value of the Variable. -1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator. -1. `operator`. 
Variable should record which operator produce itself. The reaon is: - - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable. - -In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph. - -```python -import VarDesc -import LoDTensorDesc -import framework - -def AddInitialOperator(variable, initializer): - # add an initialize Operator to block to init this Variable - -class Variable(object): - def __init__(self, name, dims, type, initializer): - self._block = get_default_block() - self._name = name - self.op = None - - tensor_desc = LoDTensorDesc(data_type=type, dims=dims) - _var_desc = VarDesc(name=name, lod_tensor=tensor_desc) - self._var = framework.CreateVar(_var_desc) - self._block.add_var(self) +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). - # add initial op according to initializer - if initializer is not None: - AddInitialOperator(self, initializer) - - def dims(self): - return self._var.dims() - - def data_type(self): - return self._var.data_type() +## Definition of LodTensorDesc - def to_proto(self): - pass +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} ``` -Then we can use this Variable to create a fc layer in Python. +A LoDTensorDesc contains a tensor and a lod_level. -```python -import paddle as pd - -def flatten_size(X, num_flatten_dims): - prod = 1 # of last num_flatten_dims - for i in xrange(num_flatten_dims): - prod = prod * X.dims[-i-1] - return prod - -def layer.fc(X, output_size, num_flatten_dims): - W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size]) - b = Variable(pd.random_uniform(), type=FP32, dims=[output_size]) - out = Variable(type=FP32) - y = operator.fc(X, W, b, output=out) # fc will put fc op input into out - pd.InferShape(y) - return out - -x = Variable(dims=[-1, 640, 480]) -y = layer.fc(x, output_size=100) -z = layer.fc(y, output_size=200) +## Definition of Variable in Python -paddle.eval(targets=[z], ...) -print(z) -``` +For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index 4d684cf8ad5a8082cf31fb27027119b3d3e700b6..63fa161fafed0f3a8ec8799af21304cbec62d813 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -21,7 +21,7 @@ wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py