diff --git a/CMakeLists.txt b/CMakeLists.txt index db3c3b8e2069f9ae5ad02286b59decf8fe764c2d..0ab80987b3ad6c4793ceeac1bf3808d2e87fbd5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +set(PY_VERSION "2.7" CACHE STRING "Python version to build PaddlePaddle against, e.g. 2.7 or 3.5") + +# PY_VERSION is a version string, so it is a STRING cache entry rather than a boolean option(); the guard below also covers an empty or stale OFF value. +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -146,6 +152,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -232,6 +239,10 @@ if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() +if(WITH_LIBXSMM) + list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS}) +endif() + if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() @@ -271,7 +282,3 @@ if(WITH_DOC) find_python_module(recommonmark REQUIRED) add_subdirectory(doc) endif() - -if (WITH_CONTRIB) - add_subdirectory(paddle/contrib) -endif() diff --git a/Dockerfile b/Dockerfile index 48c750358cfcb227667c429f19befcaa2f51ebbd..402adee2ea2822250ebc8f6229fd6a44545d58e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ RUN pip install -r /root/requirements.txt diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 
94ea7bd6aca7c9595037a2dacc5e36d4c77827e7..f8aed5a5e06c5e29dbdfb5db9f2ea0344c7eed6d 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -210,7 +210,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, # generate fake: if args.use_fake_data: for var in feed_var_list: - v = startup_prog.global_block().clone_variable(var) + v = startup_prog.global_block()._clone_variable(var) var.persistable = True v.persistable = True diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake new file mode 100644 index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0 --- /dev/null +++ b/cmake/external/libxsmm.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) # when ON, libxsmm is fetched and built below via ExternalProject + +IF(NOT WITH_LIBXSMM) + return() +ENDIF() + +IF(WIN32 OR APPLE OR ANDROID OR IOS) + MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") + SET(WITH_LIBXSMM OFF CACHE BOOL "Disable LIBXSMM" FORCE) # keep the BOOL type declared by OPTION() when forcing it off + return() +ENDIF() + +INCLUDE (ExternalProject) + +SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) +SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." 
FORCE) +SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) +SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" + "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +ExternalProject_Add( + extern_libxsmm + GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install # NOTE(review): WARP looks like a typo of libxsmm's WRAP make variable - confirm before changing + INSTALL_COMMAND "" +) +ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET_PROPERTY(TARGET libxsmm PROPERTY INTERFACE_LINK_LIBRARIES "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") # companion archive from LIBXSMM_LIBS; a second IMPORTED_LOCATION would overwrite libxsmm.a + +MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +include_directories(${LIBXSMM_INCLUDE_DIR}) +ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) +ADD_DEPENDENCIES(libxsmm extern_libxsmm) +LIST(APPEND external_project_dependencies libxsmm) + diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ce6a88b51dc98ac46dd3935f12658d60d364ba8c..56024edf5be092f81ed893633a8e7cafc8c8d429 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -121,6 +121,11 @@ ELSE() TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") +IF(WITH_LIBXSMM) + TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS}) + ADD_DEPENDENCIES(cblas extern_libxsmm) +ENDIF() + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,9 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp 2.7) -FIND_PACKAGE(PythonLibs 2.7) +FIND_PACKAGE(PythonInterp 
${PY_VERSION}) +FIND_PACKAGE(PythonLibs ${PY_VERSION}) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c6979713231f631f8757e4139d6f685d4554b54e..e2c58cd56055455e7fedc598ca8f56183d4b51dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -138,25 +138,24 @@ copy(memory_lib set(inference_deps paddle_fluid_shared paddle_fluid) -if(WITH_CONTRIB) - message(STATUS "installing contrib") - set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - if (WITH_ANAKIN AND WITH_GPU) - copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api - SRCS - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api - ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release - DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin) - list(APPEND inference_deps contrib_anakin_inference_lib) - endif() - - copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared - SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* - DSTS ${contrib_dst_dir} ${contrib_dst_dir}) - list(APPEND inference_deps contrib_inference_lib) +set(module "inference/api") +if (WITH_ANAKIN AND WITH_GPU) + copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api + SRCS + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api + ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release + DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin) + list(APPEND inference_deps anakin_inference_lib) endif() +copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared + SRCS 
${src_dir}/${module}/paddle_inference_api.h + ${src_dir}/${module}/demo_ci + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api* + DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference +) +list(APPEND inference_deps inference_api_lib) + set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md index 265732a348ea77d21005e335390d99abcdfbd045..83af4e55485c079265d3f2b1e15070825b532c02 100644 --- a/doc/fluid/design/modules/python_api.md +++ b/doc/fluid/design/modules/python_api.md @@ -98,13 +98,13 @@ class Block(objects): def append_operator(self, ...): self.ops.append(Operator(self, ...)) - def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. self.ops.prepend(Operator(self, ...)) ``` `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. -`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. +`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. 
### Operator diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md index 58aa73b8cd38d01e2426278a3479714e4fb6a3b0..749cf7693c75696feb17f8556224ed03649baa80 100644 --- a/doc/fluid/howto/performance/error_clip.md +++ b/doc/fluid/howto/performance/error_clip.md @@ -78,7 +78,7 @@ def error_clip_callback(block, context): op_desc = block.desc.op(block.desc.op_size() - 1) for grad_n in filter(lambda n: grad_to_var.has_key(n), op_desc.output_arg_names()): - fwd_var = block.var_recursive(grad_to_var[grad_n]) + fwd_var = block._var_recursive(grad_to_var[grad_n]) error_clip = getattr(fwd_var, "error_clip", None) if not (error_clip is None or isinstance(error_clip, BaseErrorClipAttr)): diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index 70c5c524aaf0a9ae003bf4340c3f268c225d4419..5813509dce46677444f0234db8e0eaa4f113e3a0 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -4,7 +4,6 @@ API .. toctree:: :maxdepth: 1 - overview.rst model_configs.rst data.rst run_logic.rst diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index 6421c5308271c2508597d849c79709255caf349a..d0dacb104f148c2aeb323365cbd6f014ae00ed5a 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -35,11 +35,16 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 docker build -t paddle:dev . # 3. 执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build # 4. 
或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build -注:上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 +注: + +- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 + +- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI <https://www.python.org/dev/peps/pep-0425/>`__. +PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`. 编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst index b08b45d43ec7f1deb2889832079a731ee724a44c..664b68da8b7dd3e005ebf3ec34de77729e5ab355 100644 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -36,13 +36,18 @@ If you don't wish to use docker,you need to install several compile dependenci # 2. Optional: build development docker image from source docker build -t paddle:dev . # 3. Run the following command to build a CPU-Only binaries - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build -NOTE: The above command try to mount the current working directory (root directory of source code) +NOTE: + +- The above command tries to mount the current working directory (root directory of source code) into :code:`/paddle` directory inside docker container. 
+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/>`__. +Currently, PaddlePaddle supports the Python ABIs :code:`cp27-cp27m` and :code:`cp27-cp27mu`. + When the compile finishes, you can get the output whl package under build/python/dist, then you can choose to install the whl on local machine or copy it to the target machine. diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt deleted file mode 100644 index 4b19256ef4533a09162edf907f6cd51146517e46..0000000000000000000000000000000000000000 --- a/paddle/contrib/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -add_subdirectory(inference) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 91ba101edb65cd45bd5e37a0c6ad25e515593a81..66e0345c299730c113ffbdc8dd3c1fa32f872f3d 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -118,7 +118,7 @@ class Float16Transpiler: for var in self.block.vars.keys(): if var not in args: - self.block.remove_var(var) + self.block._remove_var(var) def _modify_feed_fetch(self): ''' @@ -165,7 +165,7 @@ class Float16Transpiler: dtype=core.VarDesc.VarType.FP16, shape=var.shape, persistable=var.persistable) - self.block.insert_op( + self.block._insert_op( i + 1, type="cast", inputs={"X": var}, @@ -188,7 +188,7 @@ class Float16Transpiler: persistable=var.persistable) find_op(var) var.op.rename_output(var_name, tmp_var_name) - self.block.insert_op( + self.block._insert_op( i, type="cast", inputs={"X": tmp_var}, @@ -253,4 +253,4 @@ class Float16Transpiler: # old var will be replaced by the fp16 var in program desc self.input_map[var.name] = fp16_var_name - self.block.remove_var(var.name) + self.block._remove_var(var.name) diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt deleted file mode 100644 index ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -inference_api_test(simple_on_word2vec ARGS test_word2vec) - -option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF) -if(NOT WITH_INFERENCE_DEMO) - return() -endif() - -set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") -set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) - -function(inference_download_test_demo TARGET) - if (NOT WITH_TESTING) - return() - endif() - set(options "") - set(oneValueArgs URL) - set(multiValueArgs SRCS) - cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") - message(STATUS "inference demo ${test_dir}") - - if(NOT EXISTS "${test_dir}") - message(STATUS "Download ${TARGET} model from ${tests_URL}") - execute_process(COMMAND bash -c "mkdir -p ${test_dir}") - execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}") - execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") - endif() - - cc_test(${TARGET} SRCS "${tests_SRCS}" - DEPS paddle_inference_api paddle_fluid - ARGS --data=${test_dir}/data.txt - --modeldir=${test_dir}/model - --refer=${test_dir}/result.txt) -endfunction() - -# disable mobilenet test -#inference_download_test_demo(mobilenet_inference_demo -# SRCS vis_demo.cc -# URL ${URL_ROOT}mobilenet.tar.gz) -inference_download_test_demo(se_resnext50_inference_demo - SRCS vis_demo.cc - URL ${URL_ROOT}se_resnext50.tar.gz) -inference_download_test_demo(ocr_inference_demo - SRCS vis_demo.cc - URL ${URL_ROOT}ocr.tar.gz) diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md deleted file mode 100644 index f1d256660299a68dc5d9d73dbe4a401a0e7d9680..0000000000000000000000000000000000000000 --- a/paddle/contrib/inference/demo/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Infernce Demos - -Input data format: - -- Each line contains a single record -- 
Each record's format is - -``` -\t -``` - -Follow the C++ codes in `vis_demo.cc`. - -## MobileNet - -To execute the demo, simply run - -```sh -./mobilenet_inference_demo --modeldir --data -``` - -## SE-ResNeXt-50 - -To execute the demo, simply run - -```sh -./se_resnext50_inference_demo --modeldir --data -``` - -## OCR - -To execute the demo, simply run - -```sh -./ocr_inference_demo --modeldir --data -``` diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec new file mode 100644 index 0000000000000000000000000000000000000000..5c17ec4fc3909b002919b27d9967e714d6115cea --- /dev/null +++ b/paddle/fluid/API.spec @@ -0,0 +1,429 @@ +paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False)) +paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Program.current_block 
ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.rollback ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.output ArgSpec(args=['self', 
'name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) +paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Executor.run 
ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) +paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True)) +paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) +paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.test ArgSpec(args=['self', 
'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) +paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, defaults=None) +paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None) +paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10)) +paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) +paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 
'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) +paddle.fluid.InferenceTranspiler.__init__ +paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) +paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DistributeTranspilerConfig.__init__ +paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0)) +paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None +paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, 
defaults=None) +paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) +paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) 
+paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)) +paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)) +paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) +paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None)) +paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) +paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) +paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None)) +paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 
'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None)) +paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid')) +paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) +paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None)) +paddle.fluid.layers.sequence_pool 
ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, True)) +paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) +paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False)) +paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) +paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], 
varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) +paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) +paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) +paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) +paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 
'name'], varargs=None, keywords=None, defaults=(1e-12, None)) +paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) +paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) +paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 
'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) +paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) +paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) +paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) +paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) +paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) +paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR')) +paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) +paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.gather 
ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) +paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) +paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True)) +paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)) +paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,)) 
+paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) +paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)) +paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)) +paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)) +paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)) +paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], 
varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, 
keywords=None, defaults=None) +paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)) +paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)) +paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) +paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None) +paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], 
varargs='args', keywords='kwds', defaults=None) +paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')) +paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) +paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) 
+paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.read_input ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.ParallelDo.write_output ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')) +paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)) +paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.scale ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) 
+paddle.fluid.layers.elementwise_add ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_div ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_sub ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_max ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_min ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elementwise_pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.polygon_box_transform 
ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.exp ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.tanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.tanh_shrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sqrt ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.abs ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.ceil ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.floor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.cos ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sin ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.round ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.reciprocal ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.square ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softplus ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.softsign ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) 
+paddle.fluid.layers.brelu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.leaky_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.soft_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) +paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], 
varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) +paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) +paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) +paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', 
keywords='kwargs', defaults=None) +paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) +paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)) +paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None) 
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) +paddle.fluid.transpiler.InferenceTranspiler.__init__ +paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ +paddle.fluid.nets.simple_img_conv_pool 
ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False)) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) +paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) +paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None) +paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,)) +paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,)) +paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', 
defaults=(0.001, 0.9, 0.999, 1e-08)) +paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08)) +paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06)) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) +paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) +paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 
'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000)) +paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) +paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) +paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) +paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool +paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None +paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] +paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None +paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None +paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None +paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) +paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,)) +paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) +paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)) 
+paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None) +paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)) +paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None) +paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None) +paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')) +paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None) +paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) +paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) +paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None 
+paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None +paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope +paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index b82c2ef4082110f1621eb38d50361396511a4825..6f5d4471a97cc4efc73b9df68040ab9eccde0b1c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -276,13 +276,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } } - // Insert BCast Ops - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); + bool use_gpu = false; +#ifdef PADDLE_WITH_CUDA + use_gpu = nccl_ctxs_ != nullptr; +#endif + + if (use_gpu || + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + // Insert BCast Ops + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } } } + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. 
@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - - for (auto &varname : op.InputArgumentNames()) { - int dev_id = GetVarDeviceID(varname); - if (dev_id != -1) { - return dev_id; - } + int op_role = boost::get( + op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if (op_role != static_cast(framework::OpRole::kOptimize)) { + return -1; } - return -1; + auto param_grad = boost::get>( + op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(), + param_grad[0]); + return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index eb4e7ec52f907f9403e21ec2734d61824f51a58b..1d80bab90f513139f807b57258177c6b2ac53ac0 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include #include #include #include "paddle/fluid/framework/executor.h" @@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } } + std::vector fetch_data; + std::exception_ptr eptr; + try { + fetch_data = underlying_executor_->Run(fetch_tensors); + } catch (...) 
{ + eptr = std::current_exception(); + } - auto fetch_data = underlying_executor_->Run(fetch_tensors); drop_scope_counter_ += 1; if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { @@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( scope->DeleteScope(local_scope); } } - return fetch_data; + if (eptr) { + std::rethrow_exception(eptr); + } else { + return fetch_data; + } } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 99b10254a7961bf7b27b256acaece573a71c4115..07097c7e75c6ce638549716cd6523f387cdefd92 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( set.clear(); }; + // Clean run context + run_op_futures_.clear(); + exception_.reset(); + // Step 3. Execution while (!pending_vars.empty()) { // 1. 
Run All Ready ops @@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::lock_guard l(exception_mu_); + std::unique_lock l(exception_mu_); if (exception_) { + l.unlock(); + for (auto &run_op_future : run_op_futures_) { + run_op_future.wait(); + } + l.lock(); std::exception *exp = exception_.get(); if (dynamic_cast(exp)) { auto e = *static_cast(exp); - exception_.reset(); throw e; } else if (dynamic_cast(exp)) { auto e = *static_cast(exp); - exception_.reset(); throw e; } else { LOG(FATAL) << "Unknown exception."; @@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp( } }; if (pool_) { - pool_->enqueue(op_run); + run_op_futures_.emplace_back(pool_->enqueue(op_run)); } else { op_run(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index c69e0487e2e503a0d445300aa2fd6bb9c30b06c9..09973b7a72881464ad9e7776d4aad3d2261a118d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { private: ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list> run_op_futures_; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3..9a72e1baa34274201c40bd83a7aace549a7fc6ae 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -45,6 +45,7 @@ class ParallelExecutorPrivate { #endif bool own_local_scope_; bool use_cuda_; + bool use_all_reduce_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor( : 
member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->use_all_reduce_ = + build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + + if (!member_->use_all_reduce_) { + PADDLE_ENFORCE(places.size() > 1, + "If you set build_strategy.reduce with 'Reduce'," + "the number of places must be greater than 1."); + } // Step 1. Bcast the params to devs. // Create local scopes @@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor( } if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - BCastParamsToGPUs(bcast_vars); + BCastParamsToDevices(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA."); #endif } @@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor( member_->places_, std::move(member_->executor_))); } -void ParallelExecutor::BCastParamsToGPUs( +void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { - // the the initializing bcast, all vars would be bcast from device(0), + // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. bool initializing = builder_.get() == nullptr ? 
true : false; @@ -202,12 +211,23 @@ void ParallelExecutor::BCastParamsToGPUs( #endif } else { platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + if ((initializing && i == 0) || + (!initializing && static_cast(i) == var_dev_id)) + continue; + auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. + if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); + } else { + t->ShareDataWith(main_tensor); + } } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 058f83f07c26224e3180d140630c08a24c40cd80..ffb9934a2d702b2bf6db7ad75a6bf9867e1e9901 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -66,7 +66,7 @@ class ParallelExecutor { void Run(const std::vector &fetch_tensors, const std::string &fetched_var_name); - void BCastParamsToGPUs(const std::unordered_set &vars) const; + void BCastParamsToDevices(const std::unordered_set &vars) const; private: ParallelExecutorPrivate *member_; diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 6c4432cb7a70853e19460b1980d621c02caed970..a8d04feb42456607159bcbede0574fe90dfe995c 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped }; class ReaderBase { public: - void ReadNext(std::vector* out); + virtual void ReadNext(std::vector* out); - void Shutdown(); + virtual void Shutdown(); - void Start(); + virtual void Start(); // Return the readers 
which are the end of decorating chain. Basically // they are readers just before read op. @@ -42,7 +42,7 @@ class ReaderBase { virtual ~ReaderBase(); protected: - virtual void ReadNextImpl(std::vector* out) = 0; + virtual void ReadNextImpl(std::vector* out) {} virtual void ShutdownImpl() {} diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 1895aea7f98cb1ad12b2ce16545339252349ea37..c9eff0fc28c5ff52f902d3d5a0ebb37fa7619e9c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,11 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor ) +# analysis and tensorrt must be added before creating static library, +# otherwise, there would be undefined reference to them in static library. +add_subdirectory(analysis) +if (TENSORRT_FOUND) + add_subdirectory(tensorrt) +endif() + +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api @@ -7,12 +14,14 @@ cc_library(paddle_fluid_api get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -if(WITH_CONTRIB) - set(fluid_modules "${fluid_modules}" paddle_inference_api) -endif() - # Create static library cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) +if(NOT APPLE) + # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
+ set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") + set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() + # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc @@ -29,9 +38,4 @@ if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) endif() - -add_subdirectory(analysis) - -if (TENSORRT_FOUND) - add_subdirectory(tensorrt) -endif() +add_subdirectory(api) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index d09bf3ed161703b0cf273522921e157c7360a0bc..bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const { return dot.Build(); } +std::string DataFlowGraph::HumanReadableInfo(bool show_values, + bool show_functions) const { + std::stringstream values, functions; + for (auto &n : nodes.nodes()) { + if (show_values && n->IsValue()) { + values << n->repr() << "\n"; + } + if (show_functions && n->IsFunction()) { + functions << n->repr() << "\n"; + } + } + return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str(); +} + // // NodesBFSIterator // @@ -146,7 +160,7 @@ bool GraphTraits::NodesBFSIterator::operator==( if ((!queue_.empty()) && (!other.queue_.empty())) { return queue_.front() == other.queue_.front() && visited_.size() == other.visited_.size(); // here need to check the - // equality of queue and + // equality of queue and // visited. Just a light but week implementation. 
} return false; @@ -208,6 +222,76 @@ Node *GraphTraits::NodesDFSIterator::operator->() { return stack_.top(); } +GraphTraits::NodesTSIterator::NodesTSIterator( + const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not be empty!"); + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + inlink_visited.clear(); + + std::copy_if(p->inlinks.begin(), p->inlinks.end(), + std::back_inserter(inlink_visited), + [&](Node *x) { return visited.count(x); }); + + if (inlink_visited.size() == p->inlinks.size()) { + sorted_.push_back(p); + for (auto *_ : p->outlinks) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +GraphTraits::NodesTSIterator::NodesTSIterator( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &GraphTraits::NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +paddle::inference::analysis::GraphTraits::NodesTSIterator + &GraphTraits::NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +paddle::inference::analysis::GraphTraits::NodesTSIterator & +GraphTraits::NodesTSIterator::operator=( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool GraphTraits::NodesTSIterator::operator==( + const paddle::inference::analysis::GraphTraits< + DataFlowGraph>::NodesTSIterator &other) { + return sorted_ == other.sorted_ && cursor_ == other.cursor_; +} + +Node *GraphTraits::NodesTSIterator::operator->() { + 
PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index a4fefc83e0c551d52bec87299bcbc966e7a2dbf7..5dd914d1971bfb5bcc0b1db41d73e2b67120bc06 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -48,6 +48,9 @@ struct DataFlowGraph { // Output a DOT graph file for debug. std::string DotString() const; + std::string HumanReadableInfo(bool show_values = true, + bool show_functions = true) const; + private: // Remove duplicate edges and so on. void Clean(); @@ -107,6 +110,32 @@ struct GraphTraits { std::unordered_set visited_; }; + // Topological sorting iterator on nodes. + struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + explicit NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::vector sorted_; + int cursor_{0}; + }; + explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} // default use BFS to visit the nodes. 
@@ -119,17 +148,24 @@ struct GraphTraits { iterator_range nodes_in_DFS() { return iterator_range(nodes_dfs_begin(), nodes_dfs_end()); } + iterator_range nodes_in_TS() { + return iterator_range(nodes_ts_begin(), nodes_ts_end()); + } private: NodesBFSIterator nodes_bfs_begin() { return NodesBFSIterator(graph_->inputs); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } + NodesDFSIterator nodes_dfs_begin() { return NodesDFSIterator(graph_->inputs); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } + NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); } + NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } + private: DataFlowGraph *graph_; }; diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 9d7cceeb65888b8ba3fdf39e88fc2877abd82d11..7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) { auto dfg = ProgramDescToDFG(desc); dfg.Build(); - for (auto* in : dfg.inputs) { + for (auto *in : dfg.inputs) { LOG(INFO) << "inputs: " << in->name() << " " << static_cast(in->type()); } - for (auto* out : dfg.outputs) { + for (auto *out : dfg.outputs) { LOG(INFO) << "outputs: " << out->name() << " " << static_cast(out->type()); } @@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) { ASSERT_EQ(count, dfg.nodes.size()); } +// Topological sorting. 
+/* + * Graph topology + * inputs: 0, 1, 2 + * 0 -> 4 + * 0 -> 5 + * 1 -> 6 + * 2 -> 7 + * 4 -> 5 + * 4 -> 7 + * 4 -> 3 + * 7 -> 3 + */ +TEST(DataFlowGraph, TS) { + DataFlowGraph graph; + + for (int i = 0; i < 8; i++) { + auto *node = graph.nodes.Create(Node::Type::kValue); + node->SetName("node-" + std::to_string(i)); + } + + auto add_link = [&](int i, int j) { + Node *source = graph.nodes.GetMutable(i); + Node *target = graph.nodes.GetMutable(j); + target->inlinks.push_back(source); + source->outlinks.push_back(target); + }; + + graph.inputs.push_back(graph.nodes.GetMutable(0)); + graph.inputs.push_back(graph.nodes.GetMutable(1)); + graph.inputs.push_back(graph.nodes.GetMutable(2)); + + add_link(0, 4); + add_link(0, 5); + add_link(1, 6); + add_link(2, 7); + add_link(4, 5); + add_link(4, 7); + add_link(4, 3); + add_link(7, 3); + + auto its = GraphTraits(&graph).nodes_in_TS(); + std::vector sorted_ids; + for (auto it = its.begin(); it != its.end(); ++it) { + LOG(INFO) << it->name(); + sorted_ids.push_back(it->id()); + } + + // Assert a occurs prior to b in the sorted_ids. 
+ auto assert_positive_sequence_pair = [&](int a, int b) { + auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a); + auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b); + ASSERT_LT(a_offset, b_offset); + }; + + assert_positive_sequence_pair(2, 7); + assert_positive_sequence_pair(7, 3); + assert_positive_sequence_pair(4, 3); + assert_positive_sequence_pair(0, 4); + assert_positive_sequence_pair(0, 5); + assert_positive_sequence_pair(1, 6); + assert_positive_sequence_pair(4, 5); + assert_positive_sequence_pair(4, 7); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt similarity index 71% rename from paddle/contrib/inference/CMakeLists.txt rename to paddle/fluid/inference/api/CMakeLists.txt index 98c2f68a6c39ed12795bad4a905558917c0275a4..9d63d08dedf6a1bcdacc51bb83d2ed261bca4117 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -43,53 +43,64 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + SRCS api.cc api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +if(NOT APPLE) + set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym") + set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() # Here the shared library doesn't depend on other fluid libraries, or double free will occur. 
cc_library(paddle_inference_api_shared SHARED - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc) + SRCS api.cc api_impl.cc) add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api) if(NOT APPLE) - set(LINK_FLAGS "-fPIC -fvisibility=hidden") + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map") set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n" + "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" + " message(FATAL_ERROR \"Check symbol failed.\")\n" + "endif()\n") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" + DEPENDS paddle_inference_api_shared) + add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() cc_test(test_paddle_inference_api - SRCS test_paddle_inference_api.cc + SRCS test_api.cc DEPS paddle_inference_api) -inference_api_test(test_paddle_inference_api_impl +inference_api_test(test_api_impl ARGS test_word2vec test_image_classification) if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine - SRCS paddle_inference_api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) + SRCS api_tensorrt_subgraph_engine.cc + DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api) -inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) +inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec) endif() if (WITH_ANAKIN) # only needed in CI # Due to Anakin do 
not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # compile the libinference_anakin_api.a and compile with anakin.so. - nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) - nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) + nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_link_libraries(inference_anakin_api anakin anakin_saber_common) target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) if (WITH_TESTING) - cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING) endif() - -if(WITH_TESTING) - add_subdirectory(demo) -endif() diff --git a/paddle/contrib/inference/README.md b/paddle/fluid/inference/api/README.md similarity index 100% rename from paddle/contrib/inference/README.md rename to paddle/fluid/inference/api/README.md diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/fluid/inference/api/api.cc similarity index 96% rename from paddle/contrib/inference/paddle_inference_api.cc rename to paddle/fluid/inference/api/api.cc index 4fe198ad7d4a752882965e9e7fc460741de53d22..e74f23ff969f5a8f58a71da337c16dcbc14f10c0 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ 
b/paddle/fluid/inference/api/api.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/api/api.map b/paddle/fluid/inference/api/api.map new file mode 100644 index 0000000000000000000000000000000000000000..5203784dc1fcb672eb6a26d9dfd3ffbe02e08038 --- /dev/null +++ b/paddle/fluid/inference/api/api.map @@ -0,0 +1,6 @@ +{ + global: + *paddle*; + local: + *; +}; diff --git a/paddle/fluid/inference/api/api.sym b/paddle/fluid/inference/api/api.sym new file mode 100644 index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374 --- /dev/null +++ b/paddle/fluid/inference/api/api.sym @@ -0,0 +1 @@ +*paddle* diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc similarity index 89% rename from paddle/contrib/inference/paddle_inference_api_anakin_engine.cc rename to paddle/fluid/inference/api/api_anakin_engine.cc index ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f..f6f3cb335897b02905e24c229b92f3940a37dbf8 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" +#include "paddle/fluid/inference/api/api_anakin_engine.h" #include +#include namespace paddle { @@ -47,13 +48,13 @@ bool PaddleInferenceAnakinPredictor::Run( } auto d_tensor_in_p = executor_.get_in(input.name); float *d_data_p = d_tensor_in_p->mutable_data(); - if (cudaMemcpy(d_data_p, - static_cast(input.data.data()), + if (cudaMemcpy(d_data_p, static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), cudaMemcpyHostToDevice) != 0) { LOG(ERROR) << "copy data from CPU to GPU error"; return false; } + cudaStreamSynchronize(NULL); } executor_.prediction(); @@ -69,13 +70,13 @@ bool PaddleInferenceAnakinPredictor::Run( output.data.Resize(tensor->valid_size() * sizeof(float)); } // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data(), - tensor->mutable_data(), + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { LOG(ERROR) << "copy data from GPU to CPU error"; return false; } + cudaStreamSynchronize(NULL); } return true; } @@ -104,13 +105,12 @@ std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { // A factory to help create difference predictor. 
template <> -std::unique_ptr -CreatePaddlePredictor( - const AnakinConfig &config) { +std::unique_ptr CreatePaddlePredictor< + AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); return x; -}; +} } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h similarity index 88% rename from paddle/contrib/inference/paddle_inference_api_anakin_engine.h rename to paddle/fluid/inference/api/api_anakin_engine.h index 212ba41cdf8ff2feccb6b6498f9679d76a2efe7c..85ca83cd00756cca04d7b92437e9955d8ab297e7 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,7 +19,8 @@ limitations under the License. */ #pragma once -#include "paddle/contrib/inference/paddle_inference_api.h" +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" // from anakin #include "framework/core/net/net.h" @@ -31,7 +32,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: PaddleInferenceAnakinPredictor() {} - PaddleInferenceAnakinPredictor(const AnakinConfig& config); + explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. 
@@ -48,8 +49,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { private: bool Init(const AnakinConfig& config); - anakin::graph::Graph graph_; anakin::Net diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc similarity index 97% rename from paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc rename to paddle/fluid/inference/api/api_anakin_engine_tester.cc index f92e9d4190412f5847e353ef1dc0324cad668c9a..d6d631bfbad4278fe99e4553a410a9d9162dcc7b 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" DEFINE_string(model, "", "Directory of the inference model."); diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/fluid/inference/api/api_impl.cc similarity index 92% rename from paddle/contrib/inference/paddle_inference_api_impl.cc rename to paddle/fluid/inference/api/api_impl.cc index b1e5b875981e0142f6970cf6864b7b598743654b..9d9e126e134deafc18f10f7299e81d92702e9ca0 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include #include -#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/api/api_impl.h" namespace paddle { namespace { @@ -77,8 +77,8 @@ bool NativePaddlePredictor::Init( if (!config_.model_dir.empty()) { // Parameters are saved in separate files sited in // the specified `dirname`. 
- inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.model_dir); + inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), + config_.model_dir); } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used @@ -91,8 +91,8 @@ bool NativePaddlePredictor::Init( } ctx_ = executor_->Prepare(*inference_program_, 0); - executor_->CreateVariables( - *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names feed_target_names_ = inference_program_->GetFeedTargetNames(); @@ -105,7 +105,7 @@ NativePaddlePredictor::~NativePaddlePredictor() { PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); scope_->DeleteScope(sub_scope_); } -}; +} bool NativePaddlePredictor::Run(const std::vector &inputs, std::vector *output_data) { @@ -134,10 +134,8 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, // if share variables, we need not create variables VLOG(4) << "Run prepared context"; executor_->RunPreparedContext( - ctx_.get(), - sub_scope_ != nullptr ? sub_scope_ : scope_.get(), - &feed_targets, - &fetch_targets, + ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), + &feed_targets, &fetch_targets, false /* don't create variable eatch time */); VLOG(4) << "Finish prepared context"; if (!GetFetch(fetchs, output_data)) { @@ -181,8 +179,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, } // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. 
- std::memcpy(static_cast(input_ptr), - inputs[i].data.data(), + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); feeds->push_back(input); } @@ -232,8 +229,7 @@ bool NativePaddlePredictor::GetFetch( size_t start = lod[0][j - 1] * common_dim; size_t end = lod[0][j] * common_dim; if (end > start) { - std::copy(output_ptr + start, - output_ptr + end, + std::copy(output_ptr + start, output_ptr + end, data.begin() + (j - 1) * max_dim * common_dim); } } @@ -257,15 +253,13 @@ bool NativePaddlePredictor::GetFetch( } template <> -std::unique_ptr -CreatePaddlePredictor( - const NativeConfig &config) { +std::unique_ptr CreatePaddlePredictor< + NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, - 0.f, + config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); std::vector flags; diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/fluid/inference/api/api_impl.h similarity index 97% rename from paddle/contrib/inference/paddle_inference_api_impl.h rename to paddle/fluid/inference/api/api_impl.h index f9ec6f55449fc46b4a44b9563980cb5f8e80a951..92e693578ab657004f3c40c09b979897afea1e1f 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -19,7 +19,7 @@ #include #include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc similarity index 93% rename from 
paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc rename to paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index a11396cee91a758e86af2efd9e58b9da68442590..0cdc88fa1eaf3935ce0da143e1e91eb84cd70dcf 100644 --- a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/contrib/inference/paddle_inference_api.h" -#include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -77,8 +77,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; - executor_->CreateVariables( - *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names feed_target_names_ = inference_program_->GetFeedTargetNames(); @@ -98,8 +98,7 @@ CreatePaddlePredictor( if (config.use_gpu) { // 1. 
GPU memeroy PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, - 0.f, + config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); std::vector flags; diff --git a/paddle/fluid/inference/api/check_symbol.sh b/paddle/fluid/inference/api/check_symbol.sh new file mode 100755 index 0000000000000000000000000000000000000000..6547ca1413649968e8a0be146915e07192a99898 --- /dev/null +++ b/paddle/fluid/inference/api/check_symbol.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +lib=$1 +if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi + +num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l) + +if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi +if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi + +exit 0 diff --git a/paddle/fluid/inference/api/demo_ci/.gitignore b/paddle/fluid/inference/api/demo_ci/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/.gitignore @@ -0,0 +1 @@ +data diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f9bb4b33e97b5ea37e9216b00ce0c82ca3ce230 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -0,0 +1,75 @@ +cmake_minimum_required(VERSION 3.0) + +project(cpp_inference_demo CXX C) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +option(WITH_MKL "Compile demo with MKL/OpenBlas 
support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) + +if(WITH_GPU) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") +endif() + +include_directories("${PADDLE_LIB}") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") + +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) + +if(WITH_MKL) + include_directories("${PADDLE_LIB}/third_party/install/mklml/include") + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so + ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so) + set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif() +else() + set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS + 
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a + ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a) +else() + set(DEPS + ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so + ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so) +endif() +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf snappystream snappy z + ${EXTERNAL_LIB}) +if(WITH_GPU) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so) +endif() + +target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/inference/api/demo_ci/README.md b/paddle/fluid/inference/api/demo_ci/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f013da7f30acd84ec484773f4ea716a08efa0ff --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/README.md @@ -0,0 +1,26 @@ +# Inference Demos + +There are several demos: + +- simple_on_word2vec: + - Follow the C++ codes is in `simple_on_word2vec.cc`. + - It is suitable for word2vec model. +- vis_demo: + - Follow the C++ codes is in `vis_demo.cc`. + - It is suitable for mobilenet, se_resnext50 and ocr three models. + - Input data format: + - Each line contains a single record + - Each record's format is + ``` + \t + ``` + +To build and execute the demos, simply run +``` +./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU +``` +- It will build and execute the demos in both static and shared library. +- `$PADDLE_ROOT`: paddle library path +- `$TURN_ON_MKL`: use MKL or Openblas +- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode +- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first. 
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..3e829dd726b132844a45427b7b0b39eedf197496 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -0,0 +1,81 @@ +set -x +PADDLE_ROOT=$1 +TURN_ON_MKL=$2 # use MKL or Openblas +TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +if [ $2 == ON ]; then + # You can export yourself if move the install path + MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} +fi +if [ $3 == ON ]; then + use_gpu_list='true false' +else + use_gpu_list='false' +fi + +# download vis_demo data +function download() { + dir_name=$1 + mkdir -p $dir_name + cd $dir_name + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz + cd .. +} +URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F +mkdir -p data +cd data +vis_demo_list='se_resnext50 ocr mobilenet' +for vis_demo_name in $vis_demo_list; do + download $vis_demo_name +done +cd .. + +# compile and test the demo +mkdir -p build +cd build + +for WITH_STATIC_LIB in ON OFF; do + # -----simple_on_word2vec----- + rm -rf * + cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=simple_on_word2vec \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j + word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' + if [ -d $word2vec_model ]; then + for use_gpu in $use_gpu_list; do + ./simple_on_word2vec \ + --dirname=$word2vec_model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "simple_on_word2vec demo runs fail." + exit 1 + fi + done + fi + # ---------vis_demo--------- + rm -rf * + cmake .. 
-DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=vis_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j + for use_gpu in $use_gpu_list; do + for vis_demo_name in $vis_demo_list; do + ./vis_demo \ + --modeldir=../data/$vis_demo_name/model \ + --data=../data/$vis_demo_name/data.txt \ + --refer=../data/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "vis demo $vis_demo_name runs fail." + exit 1 + fi + done + done +done +set +x diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc similarity index 67% rename from paddle/contrib/inference/demo/simple_on_word2vec.cc rename to paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index c253014642f39a042430992548a285cc7078a959..5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -16,21 +16,27 @@ limitations under the License. */ * This file contains a simple demo for how to take a model for inference. */ +#include #include -#include #include -#include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include //NOLINT +#include "paddle/fluid/inference/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_bool(use_gpu, false, "Whether use gpu."); namespace paddle { namespace demo { -DEFINE_string(dirname, "", "Directory of the inference model."); - void Main(bool use_gpu) { //# 1. Create PaddlePredictor with a config. 
NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + if (FLAGS_dirname.empty()) { + LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model"; + exit(1); + } + config.model_dir = FLAGS_dirname; config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; @@ -54,12 +60,16 @@ void Main(bool use_gpu) { CHECK(predictor->Run(slots, &outputs)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } } @@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) { // Multi-threads only support on CPU // 0. Create PaddlePredictor with a config. NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname; config.use_gpu = use_gpu; config.fraction_of_gpu_memory = 0.15; config.device = 0; @@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) { CHECK(predictor->Run(inputs, &outputs)); // 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "TID: " << tid << ", " - << "output buffer size: " << outputs.front().data.length(); + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. 
+ PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } }); @@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) { } } -TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } - -#ifdef PADDLE_WITH_CUDA -TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } -#endif - } // namespace demo } // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(false /* use_gpu*/); + paddle::demo::MainThreads(1, false /* use_gpu*/); + paddle::demo::MainThreads(4, false /* use_gpu*/); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + paddle::demo::MainThreads(1, true /*use_gpu*/); + paddle::demo::MainThreads(4, true /*use_gpu*/); + } + return 0; +} diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h similarity index 93% rename from paddle/contrib/inference/demo/utils.h rename to paddle/fluid/inference/api/demo_ci/utils.h index b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1..cb8990671162dff47228736e69617229528cc093 100644 --- a/paddle/contrib/inference/demo/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -13,16 +13,15 @@ // limitations under the License. 
#pragma once +#include #include #include - -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/paddle_inference_api.h" namespace paddle { namespace demo { -static void split(const std::string& str, - char sep, +static void split(const std::string& str, char sep, std::vector* pieces) { pieces->clear(); if (str.empty()) { diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc similarity index 74% rename from paddle/contrib/inference/demo/vis_demo.cc rename to paddle/fluid/inference/api/demo_ci/vis_demo.cc index 45575f9a862de430236ae20cf498e542a45b1f4b..0a2a2b713ab21a3124d8a85ba469f64278623ec4 100644 --- a/paddle/contrib/inference/demo/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -18,26 +18,24 @@ limitations under the License. */ #include #include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. -#include #include #include -#include "paddle/contrib/inference/demo/utils.h" -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" +#include "utils.h" #ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); #endif - -namespace paddle { -namespace demo { - DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string( - data, - "", + data, "", "path of data; each line is a record, format is " "'\t data; @@ -47,7 +45,7 @@ struct Record { void split(const std::string& str, char sep, std::vector* pieces); Record ProcessALine(const std::string& line) { - LOG(INFO) << "process a line"; + VLOG(3) << "process a line"; std::vector columns; split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) { for (auto& s : shape_strs) { record.shape.push_back(std::stoi(s)); } - LOG(INFO) << "data size " << record.data.size(); - 
LOG(INFO) << "data shape size " << record.shape.size(); + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); return record; } @@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { file.close(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - LOG(INFO) << "predictor output numel " << numel; - LOG(INFO) << "reference output numel " << refer.data.size(); - EXPECT_EQ(numel, refer.data.size()); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + PADDLE_ENFORCE_EQ(numel, refer.data.size()); switch (output.dtype) { case PaddleDType::INT64: { for (size_t i = 0; i < numel; ++i) { - EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + PADDLE_ENFORCE_EQ(static_cast(output.data.data())[i], + refer.data[i]); } break; } case PaddleDType::FLOAT32: for (size_t i = 0; i < numel; ++i) { - EXPECT_NEAR( - static_cast(output.data.data())[i], refer.data[i], 1e-5); + PADDLE_ENFORCE_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); } break; } @@ -106,15 +106,15 @@ void Main(bool use_gpu) { config.prog_file = FLAGS_modeldir + "/__model__"; config.use_gpu = use_gpu; config.device = 0; -#ifdef PADDLE_WITH_CUDA - config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; -#endif + if (FLAGS_use_gpu) { + config.fraction_of_gpu_memory = 0.1; // set by yourself + } - LOG(INFO) << "init predictor"; + VLOG(3) << "init predictor"; auto predictor = CreatePaddlePredictor(config); - LOG(INFO) << "begin to process data"; + VLOG(3) << "begin to process data"; // Just a single batch of data. 
std::string line; std::ifstream file(FLAGS_data); @@ -129,21 +129,26 @@ void Main(bool use_gpu) { .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), .dtype = PaddleDType::FLOAT32}; - LOG(INFO) << "run executor"; + VLOG(3) << "run executor"; std::vector output; predictor->Run({input}, &output); - LOG(INFO) << "output.size " << output.size(); + VLOG(3) << "output.size " << output.size(); auto& tensor = output.front(); - LOG(INFO) << "output: " << SummaryTensor(tensor); + VLOG(3) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); } -TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } -#ifdef PADDLE_WITH_CUDA -TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } -#endif } // namespace demo } // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(false /* use_gpu*/); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + } + return 0; +} diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md similarity index 100% rename from paddle/contrib/inference/high_level_api.md rename to paddle/fluid/inference/api/high_level_api.md diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md similarity index 100% rename from paddle/contrib/inference/high_level_api_cn.md rename to paddle/fluid/inference/api/high_level_api_cn.md diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h similarity index 100% rename from paddle/contrib/inference/paddle_inference_api.h rename to paddle/fluid/inference/api/paddle_inference_api.h diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/fluid/inference/api/test_api.cc similarity index 96% rename from paddle/contrib/inference/test_paddle_inference_api.cc rename to paddle/fluid/inference/api/test_api.cc index 
bc7faab6e208a66d7a56e41a56bd743c7644eea2..ac8a21a22be6f27311b8ae2507d04d9d1b510e76 100644 --- a/paddle/contrib/inference/test_paddle_inference_api.cc +++ b/paddle/fluid/inference/api/test_api.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/contrib/inference/paddle_inference_api.h" - #include #include +#include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/fluid/inference/api/test_api_impl.cc similarity index 98% rename from paddle/contrib/inference/test_paddle_inference_api_impl.cc rename to paddle/fluid/inference/api/test_api_impl.cc index c3649dcb96c77f449d876bef34c4aea7afb31daa..fc1364b80ac1ee2d304eb2fe429eae5f56967516 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/fluid/inference/api/test_api_impl.cc @@ -15,10 +15,10 @@ limitations under the License. */ #include #include -#include +#include // NOLINT #include "gflags/gflags.h" -#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); @@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) { // which should be in the range [0.0, 1.0]. 
feed_target_shapes[0][0] = batch_size; framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); - SetupTensor( - &input, input_dims, static_cast(0), static_cast(1)); + SetupTensor(&input, input_dims, static_cast(0), + static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc similarity index 96% rename from paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc rename to paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc index b100630dbe412ca811f1a8f2b8191356f5ebec2f..585f6d29376c3341c21ff76361d5335512c1b1b6 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc @@ -15,7 +15,7 @@ #include #include #include -#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { @@ -61,4 +61,4 @@ void Main(bool use_gpu) { TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym new file mode 100644 index 0000000000000000000000000000000000000000..ef2a04d788aa86b7f6a61c4af479d70d1137f374 --- /dev/null +++ b/paddle/fluid/inference/paddle_fluid.sym @@ -0,0 +1 @@ +*paddle* diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ab1d2143330fb8cbfd535758a83bc71de939c4e0..d265150f25419509126028e36e629aee3ee6bd0f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -259,12 +259,15 @@ op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS 
sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) +op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) +op_library(unsqueeze_op DEPS reshape_op) +op_library(squeeze_op DEPS reshape_op) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index c9871a9fe6b3b0d0cf671c2d155715f92c94fd8f..6bd3e491bccb037406b784147dc9f91049b34d53 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -35,7 +35,14 @@ class AucOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(inference_height, label_height, "Out and Label should have same height."); + int num_thres = ctx->Attrs().Get("num_thresholds"); + ctx->SetOutputDim("AUC", {1}); + ctx->SetOutputDim("TPOut", {num_thres}); + ctx->SetOutputDim("TNOut", {num_thres}); + ctx->SetOutputDim("FPOut", {num_thres}); + ctx->SetOutputDim("FNOut", {num_thres}); + ctx->ShareLoD("Out", /*->*/ "AUC"); } @@ -63,10 +70,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "A 2D int tensor indicating the label of the training data." 
"The height is batch size and width is always 1."); + AddInput("TP", "True-Positive value."); + AddInput("FP", "False-Positive value."); + AddInput("TN", "True-Negative value."); + AddInput("FN", "False-Negative value."); // TODO(typhoonzero): support weight input AddOutput("AUC", "A scalar representing the " "current area-under-the-curve."); + AddOutput("TPOut", "True-Positive value."); + AddOutput("FPOut", "False-Positive value."); + AddOutput("TNOut", "True-Negative value."); + AddOutput("FNOut", "False-Negative value."); AddAttr("curve", "Curve type, can be 'ROC' or 'PR'.") .SetDefault("ROC"); diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index 8b016c3d31ad83e66baeb298c61840cc529efa1e..58fefc1600dfb7df3e3d71959c047865ed5e2e39 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -34,6 +34,12 @@ class AucKernel : public framework::OpKernel { auto* inference = ctx.Input("Out"); auto* label = ctx.Input("Label"); auto* auc = ctx.Output("AUC"); + // Only use output var for now, make sure it's persistable and + // not cleaned up for each batch. + auto* true_positive = ctx.Output("TPOut"); + auto* false_positive = ctx.Output("FPOut"); + auto* true_negative = ctx.Output("TNOut"); + auto* false_negative = ctx.Output("FNOut"); float* auc_data = auc->mutable_data(ctx.GetPlace()); @@ -54,19 +60,10 @@ class AucKernel : public framework::OpKernel { const T* inference_data = inference->data(); const int64_t* label_data = label->data(); - // Create local tensor for storing the curve: TP, FN, TN, FP - // TODO(typhoonzero): use eigen op to caculate these values. 
- Tensor true_positive, false_positive, true_negative, false_negative; - - true_positive.Resize({num_thresholds}); - false_negative.Resize({num_thresholds}); - true_negative.Resize({num_thresholds}); - false_positive.Resize({num_thresholds}); - - int64_t* tp_data = true_positive.mutable_data(ctx.GetPlace()); - int64_t* fn_data = false_negative.mutable_data(ctx.GetPlace()); - int64_t* tn_data = true_negative.mutable_data(ctx.GetPlace()); - int64_t* fp_data = false_positive.mutable_data(ctx.GetPlace()); + auto* tp_data = true_positive->mutable_data(ctx.GetPlace()); + auto* fn_data = false_negative->mutable_data(ctx.GetPlace()); + auto* tn_data = true_negative->mutable_data(ctx.GetPlace()); + auto* fp_data = false_positive->mutable_data(ctx.GetPlace()); for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { // caculate TP, FN, TN, FP for current thresh @@ -91,10 +88,10 @@ class AucKernel : public framework::OpKernel { } } // store rates - tp_data[idx_thresh] = tp; - fn_data[idx_thresh] = fn; - tn_data[idx_thresh] = tn; - fp_data[idx_thresh] = fp; + tp_data[idx_thresh] += tp; + fn_data[idx_thresh] += fn; + tn_data[idx_thresh] += tn; + fp_data[idx_thresh] += fp; } // epsilon to avoid divide by zero. 
float epsilon = 1e-6; diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc index c4219a429a53eb4869426a2674109555fb784b85..3a2527e407bb179c4873fa3ffe2e8f22fb47faf7 100644 --- a/paddle/fluid/operators/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase { VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name << " and dir:" << dir << " to " << epmap[i]; } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 6b06913d1c83f4534238ac3dd22ac4035c0f0fbf..5098bd8700e11c9a2faeba90c38ed2d9499b17cf 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,9 +18,6 @@ namespace paddle { namespace operators { -using conv_bwd_data = mkldnn::convolution_backward_data; -using conv_bwd_weights = mkldnn::convolution_backward_weights; -using conv_fwd = mkldnn::convolution_forward; using framework::DataLayout; using mkldnn::memory; using mkldnn::primitive; @@ -29,6 +26,196 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; +class ConvMKLDNNHandler : public platform::MKLDNNHandler { + public: + ConvMKLDNNHandler( + std::shared_ptr conv_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + conv_pd_ = conv_pd; + } + + ConvMKLDNNHandler( + std::shared_ptr conv_pd, + std::shared_ptr + conv_bwd_data_pd, + std::shared_ptr + conv_bwd_weights_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + conv_pd_(conv_pd), + conv_bwd_weights_pd_(conv_bwd_weights_pd), + 
conv_bwd_data_pd_(conv_bwd_data_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { + auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, + "@weights-src_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { + auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@weights-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, + "@diff_weights_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { + auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@data-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline) { + auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); + auto user_pd = user_weights_memory_p->get_primitive_desc(); + return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, + "@data-weights_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, 
"@diff_src_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { + return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, + "@dst_mem_p"); + } + + std::shared_ptr AcquireSrcMemoryFromPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { + auto src_pd = conv_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", + pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline) { + auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); + auto weights_pd = conv_pd_->weights_primitive_desc(); + return this->AcquireMemory(weights_pd, user_weights_pd, + user_weights_memory_p, "@weights_mem_p", + pipeline); + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr + AcquireConvolutionBackwardWeights( + std::shared_ptr src_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_weights_memory_p) { + auto prim_key = key_ + "@conv_bwd_weights_p"; + auto conv_bwd_weights_p = + std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd weights primitive in device context"); + if (conv_bwd_weights_p == nullptr) { + // create backward 
conv primitive for weights + conv_bwd_weights_p = + std::make_shared( + *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, + *diff_weights_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); + } else { + is_reusing_ = true; + } + return conv_bwd_weights_p; + } + + std::shared_ptr + AcquireConvolutionBackwardData( + std::shared_ptr diff_dst_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@conv_bwd_data_p"; + auto conv_bwd_data_p = + std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_data_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd data primitive in device context"); + if (conv_bwd_data_p == nullptr) { + conv_bwd_data_p = std::make_shared( + *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, + *diff_src_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); + } else { + is_reusing_ = true; + } + return conv_bwd_data_p; + } + + // Generate keys for storing/retriving primitives for this operator + // TODO(jczaja): Make hashing function more optimial + static std::string GetHash(memory::dims& input_dims, + memory::dims& weights_dims, + std::vector& strides, + std::vector& paddings, + std::vector& dilations, int groups, + const std::string& suffix) { + return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + + suffix; + } + + private: + std::shared_ptr conv_pd_; + std::shared_ptr + conv_bwd_weights_pd_; + std::shared_ptr + conv_bwd_data_pd_; +}; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -36,10 +223,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - // Get unique name for index - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; - 
auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -80,86 +263,84 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // create mkldnn memory from input tensors (data/weights) - auto user_src_memory = memory( - {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, - to_void_cast(input_data)); - auto user_weights_memory = - memory({{{weights_tz}, memory::data_type::f32, filter->format()}, - mkldnn_engine}, - to_void_cast(filter_data)); + // Get unique name for storing MKLDNN primitives + const std::string key = ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector pipeline; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance */ - auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), memory::format::any); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, - memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), memory::format::any); + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), memory::format::any); // create a conv primitive descriptor and save it for usage in backward - std::shared_ptr conv_pd = 
ConvFwdPrimitiveDesc( - src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); + std::shared_ptr conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine); + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); - // create reorder primitive if the input format is not the preferred one - auto src_memory = user_src_memory; - primitive reorder_src; - bool is_src_reordered = false; - if (memory::primitive_desc(conv_pd->src_primitive_desc()) != - user_src_memory.get_primitive_desc()) { - src_memory = memory(conv_pd->src_primitive_desc()); - reorder_src = reorder(user_src_memory, src_memory); - is_src_reordered = true; - } - auto weights_memory = user_weights_memory; - primitive reorder_weights; - bool is_weights_reordered = false; - if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != - user_weights_memory.get_primitive_desc()) { - weights_memory = memory(conv_pd->weights_primitive_desc()); - reorder_weights = reorder(user_weights_memory, weights_memory); - is_weights_reordered = true; - } + ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); - // create memory primitive for conv dst - auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline); + auto dst_memory_p = + handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); // create convolution op primitive - auto conv_prim = conv_fwd(*conv_pd, src_memory, 
weights_memory, dst_memory); + auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); // push primitive to stream and wait until it's executed - std::vector pipeline; - if (is_src_reordered) pipeline.push_back(reorder_src); - if (is_weights_reordered) pipeline.push_back(reorder_weights); - pipeline.push_back(conv_prim); + pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(dst_memory)); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); } private: - std::unique_ptr ConvFwdPrimitiveDesc( - const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine) const { + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; - auto conv_desc = - conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, - src, weights, dst, stride_dims, padding_dims, - padding_dims, mkldnn::padding_kind::zero); + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); - auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + auto p_conv_pd = + new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); - return std::unique_ptr(p_conv_pd); + return std::unique_ptr( + p_conv_pd); } }; @@ -197,13 +378,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { if (!input_grad && 
!filter_grad) return; - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Input("Output"); - const std::string key_conv_pd = key + "@conv_pd"; - std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -223,146 +401,116 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // create mkldnn memory from input tensors (input/weights/output_grad) - auto user_src_memory = memory( - {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, - to_void_cast(input_data)); - auto user_weights_memory = - memory({{{weights_tz}, memory::data_type::f32, filter->format()}, - mkldnn_engine}, - to_void_cast(filter_data)); - auto user_diff_dst_memory = - memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, - mkldnn_engine}, - to_void_cast(output_grad_data)); + // Get an unique name from "argument" name of "Output" variable + // as well as attributes of primitive to be created + // This name will be used as key when saving info into device context + const std::string key = + ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings, + dilations, groups, ctx.op().Input("Output")); + + const std::string key_conv_pd = key + "@conv_pd"; + std::vector pipeline; + + // Create user memory descriptors + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + auto user_diff_dst_md = platform::MKLDNNMemDesc( + {dst_tz}, 
platform::MKLDNNGetDataType(), output_grad->format()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose * the memory format preferred for best performance */ - auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); - auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, - memory::format::any); + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), memory::format::any); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), memory::format::any); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), memory::format::any); auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::f32, memory::format::any); - auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, - memory::format::any); + weights_tz, platform::MKLDNNGetDataType(), memory::format::any); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), memory::format::any); // Retrieve conv_pd from device context - auto conv_pd = std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); + // create backward convolution weights primitive descriptor + auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = + std::make_shared( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create backward convolution data primitive descriptor + auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( + 
mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = + std::make_shared( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd, + dev_ctx, mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( + user_diff_dst_md, to_void_cast(output_grad_data)); + // create backward conv primitive for weights if (filter_grad) { - // create backward convolution primitive descriptor - auto conv_bwd_weights_desc = conv_bwd_weights::desc( - mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, - strides, paddings, paddings, mkldnn::padding_kind::zero); - auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( - conv_bwd_weights_desc, mkldnn_engine, *conv_pd); - - // create reorder primitive if the input format is not the preferred one - auto src_memory = user_src_memory; - primitive reorder_src; - bool is_src_reordered = false; - if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != - user_src_memory.get_primitive_desc()) { - src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); - reorder_src = reorder(user_src_memory, src_memory); - is_src_reordered = true; - } - - auto diff_dst_memory_4filter = user_diff_dst_memory; - primitive reorder_diff_dst_4filter; - bool is_diff_dst_reordered_4filter = false; - if (memory::primitive_desc( - conv_bwd_weights_pd.diff_dst_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory_4filter = - memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); - reorder_diff_dst_4filter = - reorder(user_diff_dst_memory, 
diff_dst_memory_4filter); - is_diff_dst_reordered_4filter = true; - } - - // create mkldnn memory for output (i.e. diff weights) - auto diff_weights_memory = - memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), - reinterpret_cast(filter_grad_data)); + auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( + user_src_memory_p, pipeline); - // create backward conv primitive for weights - auto conv_bwd_weights_prim = - conv_bwd_weights(conv_bwd_weights_pd, src_memory, - diff_dst_memory_4filter, diff_weights_memory); - - // push primitive and execute it - std::vector pipeline; - if (is_src_reordered) pipeline.push_back(reorder_src); - if (is_diff_dst_reordered_4filter) - pipeline.push_back(reorder_diff_dst_4filter); - pipeline.push_back(conv_bwd_weights_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + auto diff_dst_memory_4filter_p = + handler.AcquireDiffDstMemoryFromWeightsPrimitive( + user_diff_dst_memory_p, pipeline); + + auto diff_weights_memory_p = + handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( + reinterpret_cast(filter_grad_data)); + + auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights( + src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p); + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_bwd_weights_p); filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); + filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } if (input_grad) { - // create backward convolution primitive descriptor - auto conv_bwd_data_desc = conv_bwd_data::desc( - mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, - strides, paddings, paddings, mkldnn::padding_kind::zero); - auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( - conv_bwd_data_desc, mkldnn_engine, *conv_pd); - - // create reorder primitive if the input format is not the preferred one - auto weights_memory = user_weights_memory; - 
primitive reorder_weights; - bool is_weights_reordered = false; - if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != - user_weights_memory.get_primitive_desc()) { - weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); - reorder_weights = reorder(user_weights_memory, weights_memory); - is_weights_reordered = true; - } - - auto diff_dst_memory_4data = user_diff_dst_memory; - primitive reorder_diff_dst_4data; - bool is_diff_dst_reordered_4data = false; - if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != - user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory_4data = - memory(conv_bwd_data_pd.diff_dst_primitive_desc()); - reorder_diff_dst_4data = - reorder(user_diff_dst_memory, diff_dst_memory_4data); - is_diff_dst_reordered_4data = true; - } - - // create mkldnn memory for output (i.e. diff src) - auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), - reinterpret_cast(input_grad_data)); - - // create backward conv primitive for data - auto conv_bwd_data_prim = - conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, - diff_src_memory); - - // push primitive and execute it - std::vector pipeline; - if (is_weights_reordered) pipeline.push_back(reorder_weights); - if (is_diff_dst_reordered_4data) - pipeline.push_back(reorder_diff_dst_4data); - pipeline.push_back(conv_bwd_data_prim); - stream(stream::kind::eager).submit(pipeline).wait(); + auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( + user_weights_memory_p, pipeline); + + auto diff_dst_memory_4data_p = + handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p, + pipeline); + + auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( + reinterpret_cast(input_grad_data)); + + auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData( + diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p); + + pipeline.push_back(*conv_bwd_data_p); 
input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); + input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } + stream(stream::kind::eager).submit(pipeline).wait(); } // Compute() }; diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index 4e35c38e4e03d4d0f00601812fdc4803519b89ae..b5cb6a724c095eb849f3a184f13843e1a0cca92f 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { "(float) " "Prior boxes center offset.") .SetDefault(0.5); + AddAttr( + "min_max_aspect_ratios_order", + "(bool) If set True, the output prior box is in order of" + "[min, max, aspect_ratios], which is consistent with Caffe." + "Please note, this order affects the weights order of convolution layer" + "followed by and does not affect the final detection results.") + .SetDefault(false); AddComment(R"DOC( Prior box operator Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu index f67e6ca91c0852b5a3be35d23246884d1157caa4..1ea8cfc1d2af8cc6c332768a467cdcd4c0166319 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height, const int im_width, const int as_num, const T offset, const T step_width, const T step_height, const T* min_sizes, - const T* max_sizes, const int min_num, - bool is_clip) { + const T* max_sizes, const int min_num, bool is_clip, + bool min_max_aspect_ratios_order) { int num_priors = max_sizes ? 
as_num * min_num + min_num : as_num * min_num; int box_num = height * width * num_priors; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num; @@ -44,14 +44,28 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height, T min_size = min_sizes[m]; if (max_sizes) { int s = p % (as_num + 1); - if (s < as_num) { - T ar = aspect_ratios[s]; - bw = min_size * sqrt(ar) / 2.; - bh = min_size / sqrt(ar) / 2.; + if (!min_max_aspect_ratios_order) { + if (s < as_num) { + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } else { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } } else { - T max_size = max_sizes[m]; - bw = sqrt(min_size * max_size) / 2.; - bh = bw; + if (s == 0) { + bw = bh = min_size / 2.; + } else if (s == 1) { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } else { + T ar = aspect_ratios[s - 1]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } } } else { int s = p % as_num; @@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto variances = ctx.Attr>("variances"); auto flip = ctx.Attr("flip"); auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); std::vector aspect_ratios; ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); @@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { GenPriorBox<<>>( boxes->data(), r.data(), height, width, im_height, im_width, aspect_ratios.size(), offset, step_width, step_height, min.data(), - max_data, min_num, clip); + max_data, min_num, clip, min_max_aspect_ratios_order); framework::Tensor v; framework::TensorFromVector(variances, ctx.device_context(), &v); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35..4e226abbb51c271502f0ca5419d488643b5a1a82 
100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel { auto variances = ctx.Attr>("variances"); auto flip = ctx.Attr("flip"); auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); std::vector aspect_ratios; ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); @@ -108,26 +110,59 @@ class PriorBoxOpKernel : public framework::OpKernel { int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; - // priors with different aspect ratios - for (size_t r = 0; r < aspect_ratios.size(); ++r) { - float ar = aspect_ratios[r]; - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; - e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; - e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; - e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; - idx++; - } - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; + if (min_max_aspect_ratios_order) { + box_width = box_height = min_size / 2.; e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; idx++; + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + 
box_height) / img_height; + idx++; + } + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } + } else { + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; + e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; + e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; + e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; + idx++; + } } } } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 3b0c9b2886504ee381b2b33e06a4552602725e57..9a1643d5b35c067ba9064286bab32019fb34fbe8 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel { std::minstd_rand engine, std::vector* inds) const { std::uniform_real_distribution 
uniform(0, 1); - if (inds->size() > num) { - for (int i = num; i < inds->size(); ++i) { + const int64_t size = static_cast(inds->size()); + if (size > num) { + for (int64_t i = num; i < size; ++i) { int rng_ind = std::floor(uniform(engine) * i); if (rng_ind < num) std::iter_swap(inds->begin() + rng_ind + offset, diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 35318a805898de645c844a2224f6df8c458d346c..4d60801b6a6ecaabf1165321e0cb19044d27aa34 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, req_count_++; } -void GRPCClient::Wait() { +bool GRPCClient::Wait() { std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); + return ok_; } void GRPCClient::Proceed() { @@ -297,6 +298,14 @@ void GRPCClient::Proceed() { if (c->status_.ok()) { VLOG(3) << c->var_h_.String() << " process"; c->Process(); + } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { + LOG(ERROR) << c->var_h_.String() + << " meets grpc error:" << c->status_.error_message(); + { + std::lock_guard lk(sync_mutex_); + ok_ = false; + } + sync_cond_.notify_all(); } else { LOG(FATAL) << c->var_h_.String() << " meets grpc error:" << c->status_.error_message(); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 5dae20155edcf9edd746a5d9a9bbe0ccd789f431..d03a3e56aedbe4a008ee9ff187111f7635d14b58 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() {} + GRPCClient() : ok_(true) {} virtual 
~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -221,7 +221,7 @@ class GRPCClient : public RPCClient { void AsyncSendEndPass(const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + bool Wait() override; void SendBeginPass() override; @@ -247,6 +247,7 @@ class GRPCClient : public RPCClient { std::mutex sync_mutex_; std::condition_variable sync_cond_; std::atomic req_count_{0}; + bool ok_; // mutex for GetChannel thread safety std::mutex chan_mutex_; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 6479d3a97bafba37b74a1d1c04852a6e60e01be8..4d87376fbf776e29156b78d826f5012bc53460df 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -72,7 +72,7 @@ class RPCClient { virtual void SendBeginPass() = 0; virtual void SendEndPass() = 0; - virtual void Wait() = 0; + virtual bool Wait() = 0; template static RPCClient* GetInstance() { diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a91e0f520e93c01bc5af09b691af2d5a6deda9f2 --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fake_quantize_op.h" +#include + +namespace paddle { +namespace operators { + +class FakeQuantizeOp : public framework::OperatorWithKernel { + public: + FakeQuantizeOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of FakeQuantizeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"), + "OutMovingScale(Out) of FakeQuantizeOp should not be null"); + // if (ctx->HasInput("InMovingScale")) { + ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale")); + //} + // if (ctx->HasInput("InScales")) { + PADDLE_ENFORCE(ctx->HasOutput("OutScales"), + "OutScales(Out) of FakeQuantizeOp should not be null"); + ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales")); + // PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0], + // ctx->Outputs("OutScales")[0], + // "Mean and MeanOut should share the same memory"); + //} + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddInput("InScales", "(Tensor) scale buffer, used in static quantization.") + .AsDispensable(); + AddInput("InMovingScale", "Last scale, used in static quantization.") + .AsDispensable(); + AddInput("InCurrentIter", + "Last iteration number, used in static quantization.") + .AsDispensable(); + AddOutput("Out", "(Tensor) Output of quantized low level tensor."); + AddOutput("OutScales", + "(Tensor) scale buffer, used in static quantization.") + .AsDispensable(); + 
AddOutput("OutMovingScale", " Current scale"); + AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable(); + AddAttr("quantize_type", + "(string, default abs_max)" + "The scaling tpe of the quantize operator.") + .SetDefault("abs_max"); + AddAttr("window_size", "(int, default 10000)").SetDefault(10000); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddAttr("is_test", "").SetDefault(false); + AddComment(R"DOC( +FakeQuantize operator + +quantize_type = abs_max: + + $$scale = max(abs(x))$$ + +quantize_type = range_abs_max: + + $$scale = max(max(abs(x)), history_abs_max)$$ + +quantize_type = moving_average_abs_max: + + $$scale = 0.1*scale+0.9*new_abs_max)$$ + +$$Out = scale*X$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + fake_quantize, + ops::FakeQuantizeKernel, + ops::FakeQuantizeKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..be0c6730a5119090600a27c66510b2a095c54583 --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/fake_quantize_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) { + int bid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + + extern __shared__ T shared_max_data[]; + if (gridDim.x > 1) { + shared_max_data[tid] = T(0); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { + T tmp = fabs(in[i]); + if (tmp > shared_max_data[tid]) { + shared_max_data[tid] = tmp; + } + } + } else { + if (bid < n) { + shared_max_data[tid] = fabs(in[bid]); + } else { + shared_max_data[tid] = T(0); + } + } + __syncthreads(); + + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) { + shared_max_data[tid] = shared_max_data[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + out[blockIdx.x] = shared_max_data[0]; + } +} + +float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array, + int length) { + float host_max; + int kNumTheads = 1024; + int gridDimx = (kNumTheads - 1 + length) / kNumTheads; + gridDimx = (gridDimx > kNumTheads) ? 
kNumTheads : gridDimx; + framework::Tensor t; + float* device_max = t.mutable_data(framework::make_ddim({gridDimx}), + platform::CUDAPlace()); + FindAbsMaxKernel<<>>(length, array, device_max); + FindAbsMaxKernel< + float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>( + gridDimx, device_max, device_max); + PADDLE_ENFORCE_EQ( + cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost), + cudaSuccess, "cudaMemcpy failed"); + return host_max; +} + +template +__global__ void ApplySaturateKernel(const int n, const T* in, T* out, + int* num_saturate, const T min, + const T max) { + int bid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + + extern __shared__ int shared_count[]; + shared_count[tid] = 0; + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { + if (in[i] > max) { + out[i] = max; + shared_count[tid] += 1; + } else if (in[i] < min) { + out[i] = min; + shared_count[tid] += 1; + } else { + out[i] = in[i]; + } + } + __syncthreads(); + + for (int i = blockDim.x / 2; i > 0; i >>= 1) { + if (tid < i) { + shared_count[tid] += shared_count[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + num_saturate[blockIdx.x] = shared_count[0]; + } +} + +template +__global__ void ReduceKernel(const int n, const T* in, T* out) { + int tid = threadIdx.x; + extern __shared__ T shared_sum[]; + if (tid < n) { + shared_sum[tid] = in[tid]; + } else { + shared_sum[tid] = T(0); + } + __syncthreads(); + // blockDim.x must >= n + for (int i = (n + 1) / 2; i > 0; i >>= 1) { + if (tid < i) { + shared_sum[tid] += shared_sum[tid + i]; + } + __syncthreads(); + } + if (tid == 0) { + out[0] = shared_sum[0]; + } +} + +template +int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n, + const T* in, T* out, const T min, const T max) { + int host_num_saturate; + int kNumTheads = 1024; + int gridDimx = (n + kNumTheads - 1) / kNumTheads; + gridDimx = (gridDimx > kNumTheads) ? 
kNumTheads : gridDimx; + framework::Tensor t; + int* device_num_saturate = t.mutable_data( + framework::make_ddim({gridDimx}), platform::CUDAPlace()); + ApplySaturateKernel< + T><<>>( + n, in, out, device_num_saturate, min, max); + ReduceKernel<<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>( + gridDimx, device_num_saturate, device_num_saturate); + PADDLE_ENFORCE_EQ(cudaSuccess, + cudaMemcpy(&host_num_saturate, device_num_saturate, + sizeof(int), cudaMemcpyDeviceToHost), + "cudaMemcpy failed"); + return host_num_saturate; +} + +template +class FakeQuantizeCUDAKernel : public framework::OpKernel { + public: + T FindRangeAbsMax(const platform::CUDADeviceContext& ctx, + framework::Tensor* scale_list, framework::Tensor* out_scale, + const T& cur_scale, int window_size, + int current_iter) const { + T* sl = scale_list->mutable_data(platform::CPUPlace()); + T remove_tmp = sl[current_iter]; + sl[current_iter] = cur_scale; + T& max_scale = out_scale->mutable_data(platform::CPUPlace())[0]; + if (max_scale < cur_scale) { + max_scale = cur_scale; + } else if (fabs(remove_tmp - max_scale) < 1e-6) { + int size = (current_iter > window_size) ? 
window_size : current_iter; + max_scale = T(FindAbsMaxGpu(ctx, scale_list->data(), size)); + } + return max_scale; + } + + T FindMovingAverageAbsMmax(framework::Tensor* in_scale, + framework::Tensor* out_scale, + const T& cur_scale) const { + T* ins = in_scale->mutable_data(platform::CPUPlace()); + T* outs = out_scale->mutable_data(platform::CPUPlace()); + outs[0] = 0.9 * cur_scale + 0.1 * ins[0]; + return T(outs[0]); + } + + virtual void Compute(const framework::ExecutionContext& context) const { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto& device_ctx = context.cuda_device_context(); + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + const bool is_test = context.Attr("is_test"); + tensor->mutable_data(in->place()); + context.Output("OutMovingScale") + ->mutable_data( + context.Input("InMovingScale")->place()); + auto quantize_type = + static_cast(context.Attr("quantize_type")); + if (quantize_type == std::string("range_abs_max")) { + context.Output("OutScales") + ->mutable_data( + context.Input("InScales")->place()); + context.Output("OutCurrentIter") + ->mutable_data( + context.Input("InCurrentIter")->place()); + } + + T scale = T(1); + int window_size = context.Attr("window_size"); + T bin_cnt = (T)((1 << (context.Attr("bit_length") - 1)) - 1); + if (quantize_type == std::string("abs_max")) { + auto* saving_scale = context.Output("OutMovingScale"); + scale = (T)FindAbsMaxGpu(device_ctx, in->data(), in->numel()); + saving_scale->mutable_data(platform::CPUPlace())[0] = scale; + + auto& device_ctx = context.template device_context(); + auto* scale_list = context.Output("OutScales"); + math::SetConstant scalar; + scale_list->mutable_data(context.GetPlace()); + scalar(device_ctx, scale_list, static_cast(0)); + auto* iter = context.Output("OutCurrentIter"); + iter->mutable_data(context.GetPlace()); + scalar(device_ctx, iter, static_cast(0)); + } else if (quantize_type == 
std::string("range_abs_max")) { + auto* moving_scale = const_cast( + context.Input("InMovingScale")); + if (is_test) { + scale = moving_scale->mutable_data(platform::CPUPlace())[0]; + } else { + auto* it = const_cast( + context.Input("InCurrentIter")); + auto* iter = context.Output("OutCurrentIter"); + int* last_iter = it->mutable_data(platform::CPUPlace()); + int* current_iter = iter->mutable_data(platform::CPUPlace()); + auto* scale_list = context.Output("OutScales"); + auto* saving_scale = + context.Output("OutMovingScale"); + scale = (T)FindAbsMaxGpu(device_ctx, in->data(), in->numel()); + scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale, + window_size, current_iter[0]); + (*current_iter) = (*last_iter) + 1; + } + } else if (quantize_type == std::string("moving_average_abs_max")) { + auto* moving_scale = const_cast( + context.Input("InMovingScale")); + if (is_test) { + scale = moving_scale->mutable_data(platform::CPUPlace())[0]; + } else { + scale = (T)FindAbsMaxGpu(device_ctx, in->data(), in->numel()); + auto* saving_scale = + context.Output("OutMovingScale"); + scale = FindMovingAverageAbsMmax( + const_cast(moving_scale), saving_scale, scale); + } + } + + ApplySaturateGpu(device_ctx, in->numel(), in->data(), + tensor->mutable_data(in->place()), -scale, scale); + scale = bin_cnt / scale; + + auto& dev = + *context.template device_context().eigen_device(); + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*tensor); + eigen_out.device(dev) = (scale * eigen_in).round(); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(fake_quantize, + paddle::operators::FakeQuantizeCUDAKernel< + paddle::platform::CUDADeviceContext, float>, + paddle::operators::FakeQuantizeCUDAKernel< + paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..80f71d85dde39f773cc489fb79effcc775c5010a --- /dev/null +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using platform::Transform; + +template +class FakeQuantizeKernel : public framework::OpKernel { + public: + T FindAbsMax(framework::Tensor* in, int n) const { + T* p = in->mutable_data(platform::CPUPlace()); + T abs_max = (T)0.00000001; + for (int i = 0; i < n; i++) { + T tmp = fabs(p[i]); + if (tmp > abs_max) abs_max = tmp; + } + return T(abs_max); + } + T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale, + const T& cur_scale, int window_size, + int current_iter) const { + T* sl = scale_list->mutable_data(platform::CPUPlace()); + T remove_tmp = sl[current_iter]; + sl[current_iter] = cur_scale; + T& max_scale = out_scale->mutable_data(platform::CPUPlace())[0]; + if (max_scale < cur_scale) { + max_scale = cur_scale; + } else if (fabs(remove_tmp - max_scale) < 1e-6) { + int size = (current_iter > window_size) ? 
window_size : current_iter; + max_scale = T(FindAbsMax(scale_list, size)); + } + return max_scale; + } + + T FindMovingAverageAbsMmax(framework::Tensor* in_scale, + framework::Tensor* out_scale, + const T& cur_scale) const { + T* ins = in_scale->mutable_data(platform::CPUPlace()); + T* outs = out_scale->mutable_data(platform::CPUPlace()); + outs[0] = 0.9 * cur_scale + 0.1 * ins[0]; + return T(outs[0]); + } + + virtual void Compute(const framework::ExecutionContext& context) const { + auto* tensor = context.Output("Out"); + auto* in = context.Input("X"); + const bool is_test = context.Attr("is_test"); + tensor->mutable_data(in->place()); + + auto* oms_tensor = context.Output("OutMovingScale"); + oms_tensor->mutable_data(in->place()); + + auto quantize_type = + static_cast(context.Attr("quantize_type")); + if (quantize_type == std::string("range_abs_max")) { + auto* oss_tensor = context.Output("OutScales"); + oss_tensor->mutable_data( + context.Input("InScales")->place()); + auto* oci_tensor = context.Output("OutCurrentIter"); + oci_tensor->mutable_data( + context.Input("InCurrentIter")->place()); + } + + T scale = static_cast(1); + int window_size = context.Attr("window_size"); + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + + auto& dev = + *context.template device_context().eigen_device(); + auto raw_in = framework::EigenVector::Flatten(*in); + if (quantize_type == std::string("abs_max")) { + auto* saving_scale = context.Output("OutMovingScale"); + auto scale_out = framework::EigenVector::Flatten(*saving_scale); + scale_out.device(dev) = raw_in.abs().maximum(); + scale = scale_out(0); + + auto& device_ctx = context.template device_context(); + auto* scale_list = context.Output("OutScales"); + math::SetConstant scalar; + scale_list->mutable_data(context.GetPlace()); + scalar(device_ctx, scale_list, static_cast(0)); + auto* iter = context.Output("OutCurrentIter"); + iter->mutable_data(context.GetPlace()); + 
scalar(device_ctx, iter, static_cast(0)); + } else if (quantize_type == std::string("range_abs_max")) { + auto* moving_scale = context.Input("InMovingScale"); + if (is_test) { + scale = moving_scale->data()[0]; + } else { + auto* it = context.Input("InCurrentIter"); + auto* iter = context.Output("OutCurrentIter"); + const int* last_iter = it->data(); + int* current_iter = iter->mutable_data(platform::CPUPlace()); + auto* scale_list = context.Output("OutScales"); + auto* saving_scale = + context.Output("OutMovingScale"); + auto scale_out = framework::EigenVector::Flatten(*saving_scale); + scale_out.device(dev) = raw_in.abs().maximum(); + scale = saving_scale->mutable_data(platform::CPUPlace())[0]; + scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size, + current_iter[0]); + saving_scale->mutable_data(platform::CPUPlace())[0] = scale; + (*current_iter) = (*last_iter) + 1; + } + } else if (quantize_type == std::string("moving_average_abs_max")) { + auto* moving_scale = context.Input("InMovingScale"); + if (is_test) { + scale = moving_scale->data()[0]; + } else { + auto* saving_scale = + context.Output("OutMovingScale"); + auto scale_out = framework::EigenVector::Flatten(*saving_scale); + scale_out.device(dev) = raw_in.abs().maximum(); + scale = saving_scale->mutable_data(platform::CPUPlace())[0]; + scale = FindMovingAverageAbsMmax( + const_cast(moving_scale), saving_scale, scale); + saving_scale->mutable_data(platform::CPUPlace())[0] = scale; + } + } + + Transform trans; + trans(context.template device_context(), in->data(), + in->data() + in->numel(), tensor->mutable_data(in->place()), + ClipFunctor(-scale, scale)); + auto eigen_out = framework::EigenVector::Flatten(*tensor); + auto eigen_in = framework::EigenVector::Flatten(*tensor); + eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fetch_barrier_op.cc 
b/paddle/fluid/operators/fetch_barrier_op.cc index 02beb80fc8a9f451393dcdd54492c4f88f908497..680fde19eefe57475b7526ebc29d4ff977a16977 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dadd054b9a6f8d44f4e5832888052bffde34c827 --- /dev/null +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" +#include + +namespace paddle { +namespace operators { + +/** + * Organize the classes into a binary tree. At each node, a sigmoid function + * is used to calculate the probability of belonging to the right branch. + * This idea is from "F. Morin, Y. Bengio (AISTATS 05): + * Hierarchical Probabilistic Neural Network Language Model." 
+ * + * Here we uses a simple way of making the binary tree. + * Assuming the number of classes C = 6, + * The classes are organized as a binary tree in the following way: + * + * @code{.py} + * *-*-*- 2 + * | | |- 3 + * | | + * | |-*- 4 + * | |- 5 + * | + * |-*- 0 + * |- 1 + * @endcode + * + * where * indicates an internal node, and each leaf node represents a class. + * - Node 0 ... C-2 are internal nodes. + * - Node C-1 ... 2C-2 are leaf nodes. + * - Class c is represented by leaf node \f$c+C-1\f$. + * + * We assign an id for each node: + * - the id of root be 0. + * - the left child of a node i is 2*i+1. + * - the right child of a node i is 2*i+2. + * + * It's easy to see that: + * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. + * - the j-th level ancestor of node i is + * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$. + * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$. + * + */ + +class HierarchicalSigmoidOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("PreOut"), + "Output(PreOut) should not be null."); + const int64_t batch_size = ctx->GetInputDim("X")[0]; + std::vector output_shape({batch_size, 1}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +template +class HierarchicalSigmoidOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, required) The input tensor with shape [N, D], " + "where N is the size of mini-batch, and D is the feature size."); + AddInput("W", + "(Tensor, required), The parameters of hierarchical " + "sigmoid operator, each of them is a 2-D tensor, the shape is" + "[num_classes - 1, D]."); + AddInput("Label", + "(Tensor, required), The labels of training data. It's a" + "tensor with shape [N, 1]."); + AddInput("Bias", + "(Tensor, optional), The bias is a tensor with shape" + "[1, num_classes - 1]."); + AddOutput("Out", + "(Tensor, required) The output of hierarchical sigmoid operator." + "The shape is [N, 1]."); + AddOutput("PreOut", + "(Tensor, required) A intermedia 2-D tensor with shape " + "[batch_size, code_length], where code_length represents the " + "maximum path length from root to leaf nodes.") + .AsIntermediate(); + AddAttr("num_classes", "(int, required), The number of classes") + .SetDefault(2); + AddComment(R"DOC( +The hierarchical sigmoid operator organize the classes into a binary tree. +At each node, a sigmoid function is used to calculate the probability of +belonging to the right branch. This idea is from +"F. Morin, Y. Bengio (AISTATS 05): +Hierarchical Probabilistic Neural Network Language Model." 
+ )DOC"); + } +}; + +class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PreOut"), + "Input(PreOut) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), + "Output(W@Grad) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp); +REGISTER_OP_CPU_KERNEL( + hierarchical_sigmoid, + ops::HierarchicalSigmoidOpKernel, + ops::HierarchicalSigmoidOpKernel); +REGISTER_OP_CPU_KERNEL( + hierarchical_sigmoid_grad, + ops::HierarchicalSigmoidGradOpKernel, + ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..64096a717b12ed231344649f5eb76b7e4b9af4a6 --- /dev/null +++ 
b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +namespace paddle { +namespace operators { + +template +using EigenMatrix = framework::EigenMatrix; +using platform::Transform; + +template +class HierarchicalSigmoidOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* label = ctx.Input("Label"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + auto* pre_out = ctx.Output("PreOut"); + size_t num_classes = static_cast(ctx.Attr("num_classes")); + int64_t code_length = math::FindLastSet(num_classes - 1); + int64_t batch_size = in->dims()[0]; + framework::Tensor sum; + auto& dev_ctx = ctx.template device_context(); + auto* pre_out_data = pre_out->mutable_data( + framework::make_ddim({batch_size, code_length}), ctx.GetPlace()); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. 
+ math::SetConstant zero; + zero(dev_ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.template device_context().eigen_device(); + math::RowwiseSum row_sum; + math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + std::vector sum_dims({batch_size, 1UL}); + sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); + auto sum_mat = EigenMatrix::From(sum); + out->mutable_data(ctx.GetPlace()); + auto out_mat = framework::EigenVector::Flatten(*out); + if (bias) { + bit_code.Add(pre_out, *bias); + } + bit_code.Mul(pre_out, *w, *in); + // clip to [-40, 40] + Transform trans; + trans(ctx.template device_context(), pre_out_data, + pre_out_data + pre_out->numel(), pre_out_data, + ClipFunctor(static_cast(-40.0), static_cast(40.0))); + bit_code.Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(dev_ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. 
+ out_mat.device(place) = sum_mat + out_mat; + } +}; + +template +class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* w_grad = ctx.Output(framework::GradVarName("W")); + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + auto* label = ctx.Input("Label"); + auto* pre_out = ctx.Input("PreOut"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + framework::Tensor pre_out_grad; + + pre_out_grad.mutable_data(pre_out->dims(), ctx.GetPlace()); + in_grad->mutable_data(ctx.GetPlace()); + w_grad->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, w_grad, static_cast(0.0)); + + size_t num_classes = static_cast(ctx.Attr("num_classes")); + math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + auto& place = *ctx.template device_context().eigen_device(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); + auto out_grad_mat = EigenMatrix::From(*out_grad); + Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + + // softrelu derivative + pre_out_grad_mat.device(place) = + static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); + bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b) + pre_out_grad_mat.device(place) = + pre_out_grad_mat * out_grad_mat.broadcast(bcast); + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. 
+ if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code.AddGrad(pre_out_grad, bias_grad); + } + bit_code.MulGradWeight(pre_out_grad, w_grad, *in); + bit_code.MulGradError(pre_out_grad, *w, in_grad); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 0669661d225c664010fce97f0a526b62988b92c5..8efd43928aac994c7630a213f6724e8f50abc7e0 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/im2sequence_op.h" +#include #include namespace paddle { @@ -28,27 +29,18 @@ class Im2SequenceOp : public framework::OperatorWithKernel { "Input(X) of Im2SequenceOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of Im2SequenceOp op should not be null."); - auto in_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input(X) format must be 4D tensor, eg., NCHW."); + int img_channels = in_dim[1]; auto kernels = ctx->Attrs().Get>("kernels"); auto strides = ctx->Attrs().Get>("strides"); auto paddings = ctx->Attrs().Get>("paddings"); - int batch_size = in_dim[0]; - int img_channels = in_dim[1]; - int img_height = in_dim[2]; - int img_width = in_dim[3]; - - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - ctx->SetOutputDim("Out", {batch_size * output_height * output_width, - img_channels * kernels[0] * kernels[1]}); + ctx->SetOutputDim("Out", + {in_dim[0], img_channels * kernels[0] * kernels[1]}); } }; @@ -61,6 +53,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "C: channels" "H: height" "W: width"); 
+ AddInput("Y", + "(Tensor) The input tensor of image real size(H, W)." + "2-D with shape [batchsize, 2]") + .AsDispensable(); AddOutput("Out", "(LodTensor) The output data of im2sequence op,"); AddAttr>("kernels", "(vector), the " @@ -73,6 +69,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker { "(vector default:{0, 0, 0, 0}), the " "paddings(up_pad, left_pad, down_pad, right_pad)") .SetDefault({0, 0, 0, 0}); + AddAttr>("out_stride", + "the attribute is valid only when input(Y)" + "is not NULL.this attribute represents the" + "scaling of the pic through the CNN" + "(vector dedault:{1,1}),the out_stride" + " (out_stride_height, out_stride_width)") + .SetDefault({1, 1}); AddComment(R"DOC( This op uses kernels to scan images and converts these images to sequences. After expanding, The number of time steps are output_height * output_width @@ -123,7 +126,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] [ 7. 1. 7. 9. 2. 1. 3. 5.] [ 5. 7. 2. 4. 1. 3. 9. 0.] [ 7. 9. 4. 8. 3. 5. 0. 8.]] -output.dims = {8, 9} +output.dims = {8, 8} output.lod = [[0, 4, 8]] )DOC"); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index d792c68f784d8ffec0eb303a6ab9b59c9f121fa7..4a9942819414d552eb69bd0b30b66aab76a2dbf4 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" @@ -39,50 +40,107 @@ class Im2SequenceKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const Tensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - // TODO(wanghaoshuang): Add layout checker after 'set_layout' - // being available for python API - // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW, - // "Input(X) layout must be NCHW"); auto in_dim = in->dims(); int batch_size = in_dim[0]; int img_channels = in_dim[1]; int img_height = in_dim[2]; int img_width = in_dim[3]; - auto kernels = ctx.Attr>("kernels"); auto strides = ctx.Attr>("strides"); auto paddings = ctx.Attr>("paddings"); - int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], - paddings[2], strides[0]); - int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], - paddings[3], strides[1]); - - const std::vector dilations({1, 1}); - - auto out_dims = out->dims(); - out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { - const Tensor src = - in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize( - {output_height, output_width, img_channels, kernels[0], kernels[1]}); - - math::Im2ColFunctor f; - auto& dev_ctx = ctx.template device_context(); - f(dev_ctx, src, dilations, strides, paddings, &dst); - } - out->Resize(out_dims); - - // set lod information - // TODO(wanghaoshuang): Move this to InferShape - framework::LoD lod(1); - lod[0].reserve(batch_size + 1); - for (int i = 0, offset = 0; i < batch_size + 1; ++i) { + if (ctx.HasInput("Y") && batch_size > 1) { + const Tensor* imgrealsize = ctx.Input("Y"); + auto out_stride = ctx.Attr>("out_stride"); + Tensor cpu_shape_tensor; + TensorCopySync(*imgrealsize, platform::CPUPlace(), 
&cpu_shape_tensor); + std::vector imgreal_h; + std::vector imgreal_w; + std::vector output_height; + std::vector output_width; + int result = 0; + for (int i = 0; i < batch_size; i++) { + int tmp_real_h = static_cast((cpu_shape_tensor.data())[2 * i]); + int tmp_real_w = + static_cast((cpu_shape_tensor.data())[2 * i + 1]); + if (tmp_real_h % out_stride[0] == 0) { + tmp_real_h = tmp_real_h / out_stride[0]; + } else { + tmp_real_h = tmp_real_h / out_stride[0] + 1; + } + if (tmp_real_w % out_stride[1] == 0) { + tmp_real_w = tmp_real_w / out_stride[1]; + } else { + tmp_real_w = tmp_real_w / out_stride[1] + 1; + } + imgreal_h.push_back(tmp_real_h); + imgreal_w.push_back(tmp_real_w); + output_height.push_back(Im2SeqOutputSize( + imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0])); + output_width.push_back(Im2SeqOutputSize( + imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1])); + result += output_height[i] * output_width[i]; + } + + out->mutable_data({result, img_channels * kernels[0] * kernels[1]}, + ctx.GetPlace()); + + const std::vector dilations({1, 1}); + int offset_out = 0; + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(offset_out, + offset_out + output_height[i] * output_width[i]) + .Resize({output_height[i], output_width[i], + img_channels, kernels[0], kernels[1]}); + offset_out += output_height[i] * output_width[i]; + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; + lod[0].push_back(offset); + for (int i = 0; i < batch_size; ++i) { + offset += output_height[i] * output_width[i]; + lod[0].push_back(offset); + } + out->set_lod(lod); + } else { + int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = 
Im2SeqOutputSize(img_width, kernels[1], paddings[1], + paddings[3], strides[1]); + out->mutable_data({batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}, + ctx.GetPlace()); + const std::vector dilations({1, 1}); + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = + out->Slice(i, i + 1).Resize({output_height, output_width, + img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + auto& dev_ctx = ctx.template device_context(); + f(dev_ctx, src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; lod[0].push_back(offset); - offset += output_height * output_width; + for (int i = 0; i < batch_size; ++i) { + offset += output_height * output_width; + lod[0].push_back(offset); + } + out->set_lod(lod); } - out->set_lod(lod); } }; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 56e39649b409f7eed108027f6df58c19dd3c8ab8..438b44b42aaf4c7e3ff05a5f7c52bbfd850e92c7 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -61,6 +61,8 @@ static void ParallelExecuteBlocks( framework::Async([&executor, &prepared, &program, &scope, idx]() { int run_block = idx; // thread local try { + VLOG(3) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); executor->RunPreparedContext(prepared[run_block].get(), scope); } catch (const std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); @@ -107,12 +109,14 @@ void ListenAndServOp::RunSyncLoop( PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector optimize_blocks_idx; - for (auto blk : optimize_blocks) { - 
optimize_blocks_idx.push_back(blk->ID()); + // Prepare all the server block + std::vector optimize_blocks_list; + for (size_t i = 1; i < program->Size(); ++i) { + optimize_blocks_list.push_back(i); } - auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); - // Insert placeholder for block0 which holds current op itself. + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list); + // Insert placeholder for block0 which holds current op itself, + // NOTE the first block in `optimize_prepared` should never be ran. optimize_prepared.insert( optimize_prepared.begin(), std::shared_ptr(nullptr)); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 5571ff9a7151c1f971ad1805bf001815a651202b..d2b772d11379c218be77277b89f3ded7b59ab9f3 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -51,6 +51,7 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) +math_library(matrix_bit_code) math_library(unpooling) math_library(vol2col) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f6c1e5c35f02cd4bc729eea78b17fac017aa90e..70f88f24f682e05972ca73ef7b50f96be50d1ef4 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -21,6 +21,10 @@ #include "paddle/fluid/platform/dynload/mklml.h" #endif +#ifdef PADDLE_WITH_LIBXSMM +#include +#endif + #ifdef PADDLE_USE_OPENBLAS #include #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 2ce94cfc93823aa891114ef8fd1e851727ebc623..238bd3f8def9eaa6c18afdab1031c4babfde8ae2 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under 
the License. #pragma once +#include #include #include "paddle/fluid/operators/math/math_function.h" @@ -30,6 +31,12 @@ struct CBlas { platform::dynload::cblas_sgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_sgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -63,6 +70,12 @@ struct CBlas { platform::dynload::cblas_dgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_dgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -140,6 +153,9 @@ struct CBlas { template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void SMM_GEMM(...) { + PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + } #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -147,6 +163,33 @@ struct CBlas { #endif }; +template +inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, + bool transb, const T &alpha, const T &beta) { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom + constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || + std::abs(alpha - static_cast(1)) > + std::numeric_limits::epsilon() || + std::abs(beta) > std::numeric_limits::epsilon()) { + return false; + } else { + return true; + } +#endif + return false; +} + +template <> +inline bool UseXSMM(const int &m, const int &n, const int &k, + bool transa, bool transb, + const platform::float16 &alpha, + const platform::float16 &beta) { + return false; +} + template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -156,8 +199,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? 
K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, + beta)) { + // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, + &beta, C, &ldc); + } else { +#endif + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + } +#endif } template <> diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 336d6febc2ce3a55e82ed613bbc1081101f822f0..a50b9ace39249f4f899a46e171bbdced033b46bc 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -43,21 +43,6 @@ class Im2ColFunctordims()[3]; int col_width = col->dims()[4]; - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - ((dilation[0] * (filter_height - 1) + 1))) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - ((dilation[1] * (filter_width - 1) + 1))) / - stride[1] + - 1, - col_width, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); @@ -178,17 +163,6 @@ class Im2ColFunctordims()[0]; int col_width = col->dims()[1]; - PADDLE_ENFORCE_EQ( - (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ( - (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - const T* im_data = im.data(); T* 
col_data = col->data(); diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index eecb233d22cea06da016b2671fd606b70eddf5a5..4897767f4d88d9e079f05c921153923c4eb354b0 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -77,21 +77,6 @@ class Im2ColFunctordims()[3]; int col_width = col->dims()[4]; - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - (dilation[0] * (filter_height - 1) + 1)) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - (dilation[1] * (filter_width - 1) + 1)) / - stride[1] + - 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - int num_outputs = im_channels * col_height * col_width; int blocks = (num_outputs + 1024 - 1) / 1024; int block_x = 512; @@ -274,21 +259,6 @@ class Im2ColFunctordims()[0]; int col_width = col->dims()[1]; - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - (dilation[0] * (filter_height - 1) + 1)) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - (dilation[1] * (filter_width - 1) + 1)) / - stride[1] + - 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - int block_dim_x = 0; int block_dim_y = 0; if (filter_height <= 4 && filter_width <= 4) { diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index b9bd49d77d935e985705f78402ffe1ea90f24cb3..895a7019aa10e5d9bb8f0c17e433a4344eac3bf4 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -155,7 +155,7 @@ class RowwiseSum { PADDLE_ENFORCE_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - 
PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), height); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index b545671b43d3a453ab03e4774427179617f62db0..078dd448c385dbb8a00025ee2ba08d0c41a4730a 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) { EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); } +#ifdef PADDLE_WITH_LIBXSMM +template +void MklSmmCompare(int m, int n, int k) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_smm; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + T alpha = static_cast(1); + T beta = static_cast(0); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i); + } + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + + auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + const char transa = 'N'; + const char transb = 'N'; + paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, + &alpha, B, &ldb, A, &lda, &beta, + CSMM, &ldc); + }; + + auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, + lda, B, ldb, beta, CMKL, ldc); + }; + + smm(); + mkl(); + ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); + } +} 
+TEST(math_function, gemm_mkl_vs_smm) { + MklSmmCompare(1, 2, 3); + MklSmmCompare(1, 2, 3); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 8, 5); + MklSmmCompare(3, 8, 5); +} +#endif -TEST(math_function, gemm_trans_clbas) { +TEST(math_function, gemm_trans_cblas) { paddle::framework::Tensor input1; paddle::framework::Tensor input2; paddle::framework::Tensor input3; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e56e297396c6e37867a53f039478191f0caf08e --- /dev/null +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -0,0 +1,176 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include +namespace paddle { +namespace operators { +namespace math { + +template +void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, + const framework::Tensor& vec) { + SimpleCodeTable code_table(num_classes_); + size_t batch_size = tmat->dims()[0]; + size_t width = tmat->dims()[1]; + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat->data()[i * width + j] += vec.data()[index]; + } + } +} + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, + framework::Tensor* vec) { + SimpleCodeTable code_table(num_classes_); + size_t batch_size = tmat.dims()[0]; + size_t width = tmat.dims()[1]; + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec->data()[index] += tmat.data()[i * width + j]; + } + } +} + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, + framework::Tensor* sum, T scale_sum) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t o_width = tmat.dims()[1]; + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in the + // reverse order. 
+ sm += tmat.data()[i * o_width + j]; + } + } + sum->data()[i] = scale_sum * sm; + } +} + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, + const framework::Tensor& weight, + const framework::Tensor& input) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat->dims()[0]; + size_t tmat_width = tmat->dims()[1]; + size_t input_width = input.dims()[1]; + size_t weight_width = weight.dims()[1]; + auto tmat_value = tmat->data(); + auto weight_value = weight.data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + T sum = static_cast(0.0); + for (size_t k = 0; k < input_width; ++k) { + sum += weight_value[weight_width * index + k] * + input_value[input_width * i + k]; + } + tmat_value[i * tmat_width + j] += sum; + } + } +} + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, + framework::Tensor* weight, + const framework::Tensor& input) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t input_width = input.dims()[1]; + size_t tmat_width = tmat.dims()[1]; + size_t weight_width = weight->dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight->data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + weight_value[weight_width * index + k] += + tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; + } + } + } +} + +template +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, + const framework::Tensor& weight, + framework::Tensor* input) { + SimpleCodeTable 
code_table(num_classes_); + size_t num_samples = tmat.dims()[0]; + size_t tmat_width = tmat.dims()[1]; + size_t input_width = input->dims()[1]; + size_t weight_width = weight.dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight.data(); + auto input_value = input->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } + } + } +} + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { + SimpleCodeTable code_table(num_classes_); + size_t num_samples = tmat->dims()[0]; + size_t o_width = tmat->dims()[1]; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table(static_cast(ids_[i])); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat->data()[i * o_width + j] -= 1; + } + } + } +} + +template class MatrixBitCodeFunctor; +template class MatrixBitCodeFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h new file mode 100644 index 0000000000000000000000000000000000000000..5454d58f371afb5f5d6a1c3208318f80d4e0aa36 --- /dev/null +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +/** + * SimpleCodeTable class should support 3 functions: + * + * size_t size() + * return the number of ids + * + * int get_max_code_length() + * return the maximal code length + * + * SimpleCode operator()(size_t i) + * return the i-th code. Code class is descriebed below. + * + * SimpleCode class should support 3 functions: + * + * int get_length() + * return the length of the code + * + * size_t cal_index(int bit) + * bit ranges from 0 to get_length() - 1 + * return the index for the (1+bit) level parent + * + * bool calc_bit(int bit) + * return true if the bit level parent is the right child of (1+bit) level + * parent + * + */ + +/** + * return the 1-based index of the highest bit set + * + * for x > 0: + * \f[ + * FindLastSet(x) = 1 + \floor*{\log_{2}x} + * \f] + */ +inline constexpr size_t FindLastSet(size_t x) { + return std::is_same::value + ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) + : (std::is_same::value // NOLINT + ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) + : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); +} + +struct SimpleCode { + SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} + /** + * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * is `c + num_classes` and all siblings can get the same weight indice using + * prefixes. 
+ * Weight index is the prefixes of encoding, thus leave out the right most + * bit in calc_index. + * Binary classification path is the suffixes of encoding, thus leave out the + * left most bit in calc_bit. + */ + inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } + inline bool calc_bit(int bit) const { return c_ & (1 << bit); } + inline int get_length() const { return FindLastSet(c_) - 1; } + + private: + size_t c_; +}; + +struct SimpleCodeTable { + explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {} + SimpleCode operator()(size_t code) const { + return SimpleCode(code, num_classes_); + } + size_t size() const { return num_classes_; } + int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } + + private: + size_t num_classes_; +}; + +template +class MatrixBitCodeFunctor { + public: + explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) + : num_classes_(num_classes), ids_(ids) {} + /* For j < code_length + tmat(i, j) += vec(0, index(i, j)) + */ + void Add(framework::Tensor* tmat, const framework::Tensor& vec); + + /* For j < code_length + vec(0, index(i, j)) += tmat(i, j) + */ + void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); + + /* For j < code_length + sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) + */ + void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum); + + /* For j < code_length + tmat(i, j) -= bit(i, j) + */ + void Sub(framework::Tensor* tmat); + /* For j < code_length + input.row(i) += tmat(i, j) * weight.row(index(i, j)) + */ + void Mul(framework::Tensor* tmat, const framework::Tensor& weight, + const framework::Tensor& input); + + /* For index(i, j) >= 0: + weight.row(index(i, j)) += tmat(i, j) * input.row(i) + */ + void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, + const framework::Tensor& input); + /* For j < code_length + input.row(i) += tmat(i, j) * weight.row(index(i, j)) + */ + void 
MulGradError(const framework::Tensor& tmat, + const framework::Tensor& weight, framework::Tensor* input); + + size_t num_classes_; + const int64_t* ids_; +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 8734282fe496b8e90af19abd5549566d62316fc3..4b804740a06f9e29704f2b3f58a90191e3559347 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index db8cf3b605c9175eeda4548b1e7c8203f26c5d89..28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -81,6 +81,15 @@ class BlockingQueue { } } + void ReOpen() { + std::lock_guard lock(mutex_); + closed_ = false; + std::deque new_deque; + queue_.swap(new_deque); + send_cv_.notify_all(); + receive_cv_.notify_all(); + } + void Close() { std::lock_guard lock(mutex_); closed_ = true; diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 1dbafd23e92732bdaf0d263a01e267227786d839..e17c2ffd39eea31fe85933eda144ab97cf8c3dd8 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -23,7 +23,7 @@ class BatchReader : public framework::DecoratedReader { BatchReader(const std::shared_ptr& reader, int batch_size, bool discard_leftover) : DecoratedReader(reader), - batch_size_(batch_size), + batch_size_(static_cast(batch_size)), discard_leftover_(discard_leftover) { buffer_.reserve(batch_size_); } @@ 
-31,7 +31,7 @@ class BatchReader : public framework::DecoratedReader { void ReadNextImpl(std::vector* out) override; private: - int batch_size_; + size_t batch_size_; bool discard_leftover_; std::vector> buffer_; }; @@ -78,7 +78,7 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { void BatchReader::ReadNextImpl(std::vector* out) { buffer_.clear(); buffer_.reserve(batch_size_); - for (int i = 0; i < batch_size_; ++i) { + for (size_t i = 0; i < batch_size_; ++i) { buffer_.push_back(std::vector()); reader_->ReadNext(&buffer_.back()); if (buffer_.back().empty()) { @@ -95,9 +95,9 @@ void BatchReader::ReadNextImpl(std::vector* out) { // if buffer_ is empty, the 'out' will return as an empty vector. return; } - int out_num = buffer_[0].size(); + size_t out_num = buffer_[0].size(); out->reserve(out_num); - for (int j = 0; j < out_num; ++j) { + for (size_t j = 0; j < out_num; ++j) { // Merge shape and check date type std::type_index batch_type = buffer_[0][j].type(); framework::DDim batch_shape = buffer_[0][j].dims(); diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index d41124279930e92138e7e6a5ab045659a415eb6d..833776f56eef0ffb2ae5e963919f0482bcd511b8 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -27,19 +27,17 @@ class PyReader : public framework::FileReader { queue_ = queue; } - void ReadNextImpl(std::vector* out) override { + void ReadNext(std::vector* out) override { bool success; *out = queue_->Pop(&success); if (!success) out->clear(); } - private: - void ShutdownImpl() override { /* TODO */ - } + void Shutdown() override { queue_->Close(); } - void StartImpl() override { /* TODO */ - } + void Start() override { queue_->ReOpen(); } + private: std::shared_ptr queue_; }; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h 
index 30d962ba10a954a837f9771d21cedf0feb643439..311a429f9c307f3913a1ffe5dfb7d84119c9711e 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -58,12 +58,15 @@ class LoDTensorBlockingQueue { inline size_t Size() const { return queue_.Size(); } - inline void Close() { return queue_.Close(); } + inline void ReOpen() { queue_.ReOpen(); } + + inline void Close() { queue_.Close(); } inline bool IsClosed() const { return queue_.IsClosed(); } private: - void CheckDims(const std::vector& lod_tensor_vec) { + void CheckDims( + const std::vector& lod_tensor_vec) const { PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), "Expect input size is %d but found %s", dims_.size(), lod_tensor_vec.size()); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 9854a31f5b10f5ecd940c0d41c2c3e468fc17bad..1ba684014904e61a86bebacd7d29d7e10d313092 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase { rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); } if (sync_mode) { - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 6b4572dcccc21e783f1df0b9bcde11d532ff4ba8..d7f8e994afd7e656bd5a9dd7c5ab45f0d52fe88b 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase { VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; // need to wait before sending send_barrier message - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); if (sync_mode) { for (auto& ep : eps) { VLOG(3) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } - rpc_client->Wait(); + 
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 0cac329aafa8c4c67cae48ba62a48575f5edba92..829f310d4233c01a7fbb9ccf7427f6e47ce8d384 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase { } } if (sync_send) { - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c507baf3a0ab0a557d29a53700685753616193b --- /dev/null +++ b/paddle/fluid/operators/squeeze_op.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class SqueezeOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SqueezeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SqueezeOp should not be null."); + + const auto &x_dims = ctx->GetInputDim("X"); + // Check input tensor dims (<6) Eigen limit. 
+ PADDLE_ENFORCE(x_dims.size() <= 6, + "Invalid dimnesions, the rank of Input(X) " + "should be in the range of [1, 6] (Eigen limit)."); + + const auto &axes = ctx->Attrs().Get>("axes"); + for (int a : axes) { + PADDLE_ENFORCE_LT(a, x_dims.size(), + "The squeeze axis should be less than input " + "tensor's rank."); + } + + auto out_dims = GetOutputShape(axes, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", "Out"); + } + } + + static framework::DDim GetOutputShape(const std::vector squeeze_dims, + const framework::DDim &in_dims) { + size_t num_squeeze_dims = squeeze_dims.size(); + int cnt_squeezed_dims = 0; + bool should_squeeze[9] = {false}; + + // Determines number of dimensions of output tensor after squeeze. + // Mark and count the dimensions need to be squeezed + if (num_squeeze_dims == 0) { + for (int idx = 0; idx < in_dims.size(); ++idx) { + if (in_dims[idx] == 1) { + should_squeeze[idx] = true; + ++cnt_squeezed_dims; + } + } + } else { + for (size_t idx = 0; idx < num_squeeze_dims; ++idx) { + int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size() + : squeeze_dims[idx]; + // Check current index, the upper limit has beed checked in line 36. 
+ PADDLE_ENFORCE(current >= 0, + "Invalid axis, the negative axis is out of range."); + PADDLE_ENFORCE(in_dims[current] == 1, + "Invalid axis index, the axis that will be squeezed " + "should be equal to 1."); + + if (!(should_squeeze[current])) { + ++cnt_squeezed_dims; + } + should_squeeze[current] = true; + } + } + + // Make output dimensions + std::vector output_shape(in_dims.size() - cnt_squeezed_dims, 0); + for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) { + if (!should_squeeze[in_idx]) { + output_shape[out_idx++] = in_dims[in_idx]; + } + } + + return framework::make_ddim(output_shape); + } +}; + +class SqueezeOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + attrs["inplace"] = Attr("inplace"); + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor). The input tensor of squeeze operator."); + AddOutput("Out", "(Tensor). The output tensor of squeeze operator."); + AddAttr>("axes", + "(std::vector). List of integers," + " indicating the dimensions to squeeze.") + .SetDefault({}); + AddAttr("inplace", + "(default: false) Squeeze the source tensor's shape without " + "memory copy. 
When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); + AddComment(R"DOC( + Squeeze Operator. + + Remove single-dimensional entries from the shape of a tensor. + Takes a parameter axes with a list of axes to squeeze. + If axes is not provided, all the single dimensions will be removed from the shape. + If an axis is selected with shape entry not equal to one, an error is raised. + + Examples: + Case 1: + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + + Case 2: + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) + )DOC"); + } +}; + +class SqueezeGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class SqueezeGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + attrs["inplace"] = Attr("inplace"); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle + +// Tell linker to use reshape op +USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, + ops::SqueezeOpInferShape, + 
paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape); diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f78d977760f18c9eb1270e515e68acb208a7c9a4..d2035777ee2289291a02594ee289156504df09d9 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -88,7 +88,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { input_format = memory::format::nc; } - for (int i = in_place ? 1 : 0; i < N; i++) { + for (int i = 0; i < N; i++) { PADDLE_ENFORCE(in_vars[i]->IsType(), "all inputs must be all LoDTensors"); auto& input = in_vars[i]->Get(); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 7ddb82ef6ff063868a4b9b603b8ab89700b9dd13..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel { #endif for (size_t i = 0; i < row; i++) { std::vector> vec; + vec.reserve(col); for (size_t j = 0; j < col; j++) { vec.push_back(std::pair(eg_input(i, j), j)); } diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f2a15fdf572e0de30f9949dda5020e130b0c5585 --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -0,0 +1,191 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UnsqueezeOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnsqueezeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnsqueezeOp should not be null."); + + const auto &axes = ctx->Attrs().Get>("axes"); + const auto &x_dims = ctx->GetInputDim("X"); + // Validity Check: input tensor dims (<6). + PADDLE_ENFORCE(x_dims.size() <= 6, + "Invalid dimensions, the rank of Input(X) " + "should be in the range of [1, 6] (Eigen limit)"); + auto out_dims = GetOutputShape(axes, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", "Out"); + } + } + + static framework::DDim GetOutputShape(const std::vector unsqz_dims, + const framework::DDim &in_dims) { + int output_size = in_dims.size() + static_cast(unsqz_dims.size()); + int cur_output_size = in_dims.size(); + std::vector output_shape(output_size, 0); + + // Validity Check: rank range. + PADDLE_ENFORCE(output_size <= 6, + "The output tensor's rank should be less than 6."); + + for (int axis : unsqz_dims) { + int cur = axis < 0 ? axis + cur_output_size + 1 : axis; + // Vaildity Check: the axis bound + PADDLE_ENFORCE( + cur >= 0 && cur <= cur_output_size, + "The unsqueeze dims must be within range of current rank."); + // Move old axis, and insert new axis + for (int i = cur_output_size; i >= cur; --i) { + if (output_shape[i] == 1) { + // Move axis + output_shape[i + 1] = 1; + output_shape[i] = 0; + } + } + output_shape[cur] = 1; + // Add the output size. 
+ cur_output_size++; + } + + // Make output shape + for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { + if (output_shape[out_idx] == 0) { + output_shape[out_idx] = in_dims[in_idx++]; + } + } + + return framework::make_ddim(output_shape); + } +}; + +class UnsqueezeOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axes = Attr>("axes"); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(out_dims); + attrs["inplace"] = Attr("inplace"); + // Invoke Reshape op. + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor). The input tensor of unsqueeze operator."); + AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator."); + AddAttr>("axes", + "(std::vector). List of integers," + " indicating the dimensions to be inserted") + .AddCustomChecker([](const std::vector &axes) { + PADDLE_ENFORCE(!axes.empty(), + "Invalid axes, The unsqueeze axes is empty."); + // Validity Check: axes dims (<6). + PADDLE_ENFORCE(static_cast(axes.size()) < 6, + "Invalid dimensions, dynamic dimensions should be " + "within [1, 6] dimensions (Eigen limit)."); + // Validity Check: the range of unsqueeze aixs. + for (int axis : axes) { + PADDLE_ENFORCE(axis < 6, + "Invalid dimensions, input axis should be" + " within [1, 6] dimensions (Eigen limit)."); + } + }); + AddAttr( + "inplace", + "(default: false) Unsqueeze the source tensor's shape without " + "memory copy. 
When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); + AddComment(R"DOC( + Unsqueeze Operator. + + Insert single-dimensional entries to the shape of a tensor. + Takes one required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. + + For example: + Given a tensor such that tensor with shape [3, 4, 5], + then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1] + )DOC"); + } +}; + +class UnsqueezeGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class UnsqueezeGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto x_dims = scope.FindVar(Input("X"))->Get().dims(); + + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(x_dims); + attrs["inplace"] = Attr("inplace"); + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle + +// Tell linker to use reshape op. 
+USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, + ops::UnsqueezeOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, + ops::UnsqueezeGradInferShape); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 20037d0764056c2a093af801c9cc1eb788dd46d6..e0d7937ae2f3ce4bda12f3771727e2992d63cb9b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -46,7 +46,7 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS malloc - place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) + place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) cc_test(init_test SRCS init_test.cc DEPS device_context) diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 33fec2c1073819d88d85a8872227adcb9df3e8f4..a8f93e6848a1db1f5aa0ee266a076af2b5d0c964 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -222,15 +222,16 @@ class MKLDNNHandler { static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT const std::string& suffix) { - auto dims2str = [](const mkldnn::memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(operand_dims) + suffix; + }; + + protected: + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; } protected: diff 
--git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index fcd3356d44ee592233c3883d439d0677714900b8..2199f5311fd3728e624fc222a1b876eb947cc0aa 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -145,14 +145,14 @@ void BindBlockDesc(pybind11::module *m) { .def_property_readonly("id", &pd::BlockDesc::ID) .def_property_readonly("parent", &pd::BlockDesc::Parent) .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID) - .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID) + .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID) .def("append_op", &pd::BlockDesc::AppendOp, pybind11::return_value_policy::reference) - .def("prepend_op", &pd::BlockDesc::PrependOp, + .def("_prepend_op", &pd::BlockDesc::PrependOp, pybind11::return_value_policy::reference) - .def("insert_op", &pd::BlockDesc::InsertOp, + .def("_insert_op", &pd::BlockDesc::InsertOp, pybind11::return_value_policy::reference) - .def("remove_op", &pd::BlockDesc::RemoveOp) + .def("_remove_op", &pd::BlockDesc::RemoveOp) .def("var", [](pd::BlockDesc &self, pybind11::bytes byte_name) { std::string name = byte_name; @@ -165,7 +165,7 @@ void BindBlockDesc(pybind11::module *m) { return self.HasVar(name); }, pybind11::return_value_policy::reference) - .def("rename_var", + .def("_rename_var", [](pd::BlockDesc &self, const pybind11::bytes &byte_name, const pybind11::bytes &byte_name_new) { std::string name = byte_name; @@ -189,7 +189,7 @@ void BindBlockDesc(pybind11::module *m) { return self.FindVarRecursive(name); }, pybind11::return_value_policy::reference) - .def("remove_var", + .def("_remove_var", [](pd::BlockDesc &self, pybind11::bytes byte_name) { std::string name = byte_name; return self.RemoveVar(name); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0c523b6f176345c0407b8541c04fb8c3b27f7c60..216c4666c0a311f93f29692b2ca1d17bf9dafab8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ 
b/paddle/fluid/pybind/pybind.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include +#include #include // NOLINT // for call_once #include #include @@ -66,6 +67,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithDIST() { +#ifdef PADDLE_WITH_DISTRIBUTE + return true; +#else + return false; +#endif +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -78,37 +87,37 @@ PYBIND11_PLUGIN(core) { py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) - .def("get_dims", + .def("_get_dims", [](const Tensor &self) { return vectorize(self.dims()); }) - .def("set_dims", + .def("_set_dims", [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) - .def("set_layout", + .def("_set_layout", [](Tensor &self, const std::string &layout) { self.set_layout(StringToDataLayout(layout)); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) - .def("alloc_int", + .def("_alloc_int", [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) - .def("alloc_float", + .def("_alloc_float", [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) @@ -136,11 +145,11 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) - .def("set_float_element", TensorSetElement) - .def("get_float_element", TensorGetElement) - .def("set_double_element", 
TensorSetElement) - .def("get_double_element", TensorGetElement) - .def("dtype", [](Tensor &self) { return ToDataType(self.type()); }); + .def("_set_float_element", TensorSetElement) + .def("_get_float_element", TensorGetElement) + .def("_set_double_element", TensorSetElement) + .def("_get_double_element", TensorGetElement) + .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); py::class_(m, "LoDTensor") .def_buffer( @@ -302,7 +311,8 @@ All parameter, weight, gradient are variables in Paddle. ::paddle::operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = ::paddle::operators::reader::LoDTensorBlockingQueueHolder; - py::class_(m, "LoDTensorBlockingQueue", "") + py::class_>( + m, "LoDTensorBlockingQueue", "") .def("push", [](LoDTensorBlockingQueue &self, const std::vector &lod_tensor_vec) { @@ -317,7 +327,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity, const std::vector> &shapes) - -> LoDTensorBlockingQueue * { + -> std::shared_ptr { std::vector dims(shapes.size()); std::transform(shapes.begin(), shapes.end(), dims.begin(), [](const std::vector &shape) { @@ -325,9 +335,9 @@ All parameter, weight, gradient are variables in Paddle. }); auto *holder = var.GetMutable(); holder->InitOnce(capacity, dims); - return holder->GetQueue().get(); + return holder->GetQueue(); }, - py::return_value_policy::reference); + py::return_value_policy::copy); py::class_(m, "Scope", "") .def("var", @@ -508,6 +518,7 @@ All parameter, weight, gradient are variables in Paddle. 
[](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_dist", IsCompiledWithDIST); #ifdef PADDLE_WITH_CUDA m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 @@ -534,6 +545,8 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "LoDTensorArray") + .def("__init__", + [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); }) .def("__getitem__", [](LoDTensorArray &self, size_t i) { return &self.at(i); }, py::return_value_policy::reference) @@ -656,7 +669,7 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) + .def("bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. 
We can only return Scope* diff --git a/paddle/legacy/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp index 87fac3d6c6abe37b128213d4ffd66f8c1573a910..0ce1770c76c2e145d0b2bf71332cc4593517f195 100644 --- a/paddle/legacy/capi/Arguments.cpp +++ b/paddle/legacy/capi/Arguments.cpp @@ -66,6 +66,17 @@ paddle_error paddle_arguments_get_value(paddle_arguments args, return kPD_NO_ERROR; } +PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args, + uint64_t ID, + paddle_matrix mat) { + if (args == nullptr || mat == nullptr) return kPD_NULLPTR; + auto m = paddle::capi::cast(mat); + auto a = castArg(args); + if (ID >= a->args.size()) return kPD_OUT_OF_RANGE; + m->mat = a->args[ID].in; + return kPD_NO_ERROR; +} + paddle_error paddle_arguments_get_ids(paddle_arguments args, uint64_t ID, paddle_ivector ids) { diff --git a/paddle/legacy/capi/arguments.h b/paddle/legacy/capi/arguments.h index 69a66bb012c318bc8317c246d690a7f4baffd248..ceb64ee6aa74a8ba4b5cb9045b366dcda8f8cc90 100644 --- a/paddle/legacy/capi/arguments.h +++ b/paddle/legacy/capi/arguments.h @@ -87,6 +87,18 @@ PD_API paddle_error paddle_arguments_get_value(paddle_arguments args, uint64_t ID, paddle_matrix mat); +/** + * @brief paddle_arguments_get_prob Get the prob matrix of beam search, which + * slot ID is `ID` + * @param [in] args arguments array + * @param [in] ID array index + * @param [out] mat matrix pointer + * @return paddle_error + */ +PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args, + uint64_t ID, + paddle_matrix mat); + /** * @brief PDArgsGetIds Get the integer vector of one argument in array, which * index is `ID`. 
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh deleted file mode 120000 index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh deleted file mode 120000 index 3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh deleted file mode 120000 index 
3c1b3533523cf1709720d11df7b8e311e0577fe7..0000000000000000000000000000000000000000 --- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh +++ /dev/null @@ -1 +0,0 @@ -../dense/convert_protobin.sh \ No newline at end of file diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh new file mode 100644 index 0000000000000000000000000000000000000000..b29f2cd21418ecbd2fb2ba626138e5aa11bf77f3 --- /dev/null +++ b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh @@ -0,0 +1 @@ +../dense/convert_protobin.sh diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp index 7faeff55c28b9065179ad27b3b604a9f411249e5..21ed049c4d2743d1fa914d6948d6c8c2862f0bfc 100644 --- a/paddle/legacy/utils/PythonUtil.cpp +++ b/paddle/legacy/utils/PythonUtil.cpp @@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args); +#if PY_MAJOR_VERSION >= 3 + Py_ssize_t str_size = 0u; + const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size); + return std::string(str, (size_t)str_size); +#else return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); +#endif // PY_MAJOR_VERSION >= 3 } PyObjectPtr createPythonClass( diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h index b0c8612c378fbe12cdf24e51a5b6546740b2d4c8..d5b2dbddde21f5c2a0696aadeda2b057175fc5e9 100644 --- a/paddle/legacy/utils/PythonUtil.h +++ b/paddle/legacy/utils/PythonUtil.h @@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName, namespace py { PyObjectPtr import(const std::string& moduleName); +#if PY_MAJOR_VERSION >= 3 +/** + * Cast a PyLong to int type T. + * @tparam T return type. + * @param [in] obj PyLong object. 
+ * @param [out] ok status for casting. False if error occured. nullptr if user + * don't care is ok or not. + * @return The value of python object, or 0 if not ok. + */ +template +T castInt(PyObject* obj, bool* ok = nullptr) { + // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object + // were unified to long since python3 + if (PyLong_Check(obj)) { + if (ok) *ok = true; + return (T)PyLong_AsUnsignedLong(obj); + } else { + if (ok) *ok = false; + return (T)0; + } +} + +// Convert PyAPI from 2.x to 3.x +#define PyString_FromString PyUnicode_FromString +#define PyString_AsString PyUnicode_AsUTF8 + +#else /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. @@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) { return (T)0; } } +#endif // PY_MAJOR_VERSION >= 3 /** * Invoke repr of python object. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d173b41e86f61954954b6a5ea9957d2e172deca0..c9f74127457e422ec4adab23b6d333e72250faf4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -19,6 +19,8 @@ # Utils #================================================= +set -ex + function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" @@ -37,6 +39,7 @@ function print_usage() { ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check ${BLUE}cicheck${NONE}: run CI tasks + ${BLUE}assert_api_not_changed${NONE}: check api compability " } @@ -78,6 +81,12 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export 
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" fi fi @@ -108,6 +117,7 @@ function cmake_gen() { -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -136,7 +146,8 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ + -DPY_VERSION=${PY_VERSION:-2.7} } function abort(){ @@ -318,11 +329,22 @@ function assert_api_not_changed() { virtualenv .env source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl - curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \ - > origin.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec + python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate + + API_CHANGE=`git diff --name-only HEAD^ | grep "paddle/fluid/API.spec" || true` + echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" + if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then + # TODO: curl -H 'Authorization: token ${TOKEN}' + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have at least 2 approvals for the api change!" 
+ exit 1 + fi + fi } @@ -508,15 +530,27 @@ function gen_fluid_inference_lib() { Deploying fluid inference library ... ======================================== EOF + cmake .. -DWITH_DISTRIBUTE=OFF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build - mv fluid_install_dir fluid + cp -r fluid_install_dir fluid tar -cf fluid.tgz fluid fi } +function test_fluid_inference_lib() { + if [ ${WITH_C_API:-OFF} == "OFF" ] ; then + cat <> sys.stderr, err_msg + return func(*args, **kwargs) + + wrapper.__doc__ += "\n " + wrapper.__doc__ += err_msg + return wrapper + + return decorator diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 4faa06303170488d0de2fda4c1461cfe2d623d35..812f68bdd849544456b2e0ebf0b739f4f92b09ea 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -18,10 +18,7 @@ import collections import copy import unique_name -__all__ = [ - 'append_backward', - 'calc_gradient', -] +__all__ = ['append_backward'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): @@ -123,7 +120,8 @@ def _append_grad_suffix_(name): def _addup_repetitive_outputs_(op_descs): """ In backward part, an variable may be the output of more than one ops. - In this case, the variable should be the accumulation of all the outputs. + And one op may yield its multiple outputs to the same variable. + In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. 
""" pending_sum_ops = [] @@ -136,29 +134,46 @@ def _addup_repetitive_outputs_(op_descs): "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, {"use_mkldnn": False}), idx)) renamed_vars[var_name] = [var_name] - for var_name in op_desc.output_arg_names(): - if var_name == core.empty_var_name( - ) or var_name in op_desc.input_arg_names(): - # empty variable or inplace op - continue - if len(renamed_vars[var_name]) == 0: - # it's the first time we get the variable - renamed_vars[var_name] = [var_name] - else: - if len(renamed_vars[var_name]) == 1: + for param_idx, param_name in enumerate(op_desc.output_names()): + arg_names = op_desc.output(param_name) + for arg_idx, var_name in enumerate(arg_names): + if var_name == core.empty_var_name( + ) or var_name in op_desc.input_arg_names(): + # empty variable or inplace op + continue + if len(renamed_vars[var_name]) == 0: + # it's the first time we get the variable + renamed_vars[var_name] = [var_name] + else: + if len(renamed_vars[var_name]) == 1: + new_name = var_name + "@RENAME@" + \ + str(var_rename_count[var_name]) + var_rename_count[var_name] += 1 + # rename original var_name + renamed_vars[var_name][0] = new_name + _rename_arg_(op_descs, var_name, new_name, 0, idx) + _rename_arg_(pending_sum_ops, var_name, new_name) + + for p in op_desc.output_names()[:param_idx]: + p_arg_names = op_desc.output(p) + if var_name in p_arg_names: + op_desc.set_output(p, [ + new_name if x == var_name else x + for x in p_arg_names + ]) + + arg_names = [ + new_name if x == var_name else x + for x in arg_names[:arg_idx] + ] + arg_names[arg_idx:] + new_name = var_name + "@RENAME@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 - # rename original var_name - renamed_vars[var_name][0] = new_name - _rename_arg_(op_descs, var_name, new_name, 0, idx) - _rename_arg_(pending_sum_ops, var_name, new_name) - - new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) - var_rename_count[var_name] += 1 - 
op_desc.rename_output(var_name, new_name) - renamed_vars[var_name].append(new_name) + arg_names[arg_idx] = new_name + op_desc.set_output(param_name, arg_names) + renamed_vars[var_name].append(new_name) + for var_name, inputs in renamed_vars.iteritems(): if len(inputs) > 1: pending_sum_ops.append( @@ -313,7 +328,7 @@ def _append_backward_ops_(block, if op.has_attr("sub_block"): sub_block = program.block(op.block_attr("sub_block")) grad_sub_block = program.create_block() - grad_sub_block.set_forward_block_idx(sub_block.idx) + grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) if cb is not None: if callbacks is None: @@ -556,7 +571,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) program.current_block_idx = current_block_idx - program.sync_with_cpp() + program._sync_with_cpp() # FIXME(zcd): prevent loss.grad optimized by mem_opt. loss.block.var(_append_grad_suffix_(loss.name)).persistable = True @@ -729,7 +744,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) - prog.sync_with_cpp() + prog._sync_with_cpp() grad_vars = [] for input_var in inputs: diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 18e2f3045e272fb4712391f87bffd3f367c1c744..c029662ebc1b7e7f7d1ea44b4ebd4b08b812a579 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -31,7 +31,7 @@ class BaseErrorClipAttr(object): def __str__(self): raise NotImplementedError() - def append_clip_op(self, block, grad_name): + def _append_clip_op(self, block, grad_name): raise NotImplementedError() @@ -67,7 +67,7 @@ class ErrorClipByValue(BaseErrorClipAttr): def __str__(self): return "ByValue, min=%f, max=%f" % (self.min, self.max) - def append_clip_op(self, block, grad_name): + def 
_append_clip_op(self, block, grad_name): clip_op_desc = block.desc.append_op() clip_op_desc.set_type("clip") clip_op_desc.set_input("X", [grad_name]) @@ -82,7 +82,7 @@ def error_clip_callback(block, context): op_desc = block.desc.op(block.desc.op_size() - 1) for grad_n in filter(lambda n: grad_to_var.has_key(n), op_desc.output_arg_names()): - fwd_var = block.var_recursive(grad_to_var[grad_n]) + fwd_var = block._var_recursive(grad_to_var[grad_n]) error_clip = getattr(fwd_var, "error_clip", None) if not (error_clip is None or isinstance(error_clip, BaseErrorClipAttr)): @@ -90,17 +90,17 @@ def error_clip_callback(block, context): "Variable's error_clip should be an instance of BaseErrorClipAttr or None." ) if error_clip is not None: - error_clip.append_clip_op(block, grad_n) + error_clip._append_clip_op(block, grad_n) class BaseGradientClipAttr(object): def __str__(self): raise NotImplementedError() - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): raise NotImplementedError() - def create_operators(self, param, grad): + def _create_operators(self, param, grad): raise NotImplementedError() @@ -108,10 +108,10 @@ class NullGradientClipAttr(BaseGradientClipAttr): def __str__(self): return "Null" - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): return param, grad @@ -153,10 +153,10 @@ class GradientClipByValue(BaseGradientClipAttr): def __str__(self): return "ByValue, min=%f, max=%f" % (self.min, self.max) - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): new_grad = layers.clip(x=grad, min=self.min, max=self.max) return param, new_grad @@ -199,10 +199,10 @@ class GradientClipByNorm(BaseGradientClipAttr): def __str__(self): return 
"ByNorm, clip_norm=%f" % self.clip_norm - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): pass - def create_operators(self, param, grad): + def _create_operators(self, param, grad): new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) return param, new_grad @@ -257,7 +257,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name, self.clip_norm) - def process_context(self, context, param, grad): + def _process_context(self, context, param, grad): if self.group_name not in context: context[self.group_name] = [] context[self.group_name + "_clip_value"] = self.clip_norm @@ -274,7 +274,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): self.context = context - def create_operators(self, param, grad): + def _create_operators(self, param, grad): group_scale_name = self.group_name + "_scale" if group_scale_name not in self.context: group_norm_var = layers.sums(input=self.context[self.group_name]) @@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None): param.gradient_clip_attr = copy.deepcopy(clip) -def append_gradient_clip_ops(param_grad): +def append_gradient_clip_ops(param_grads): context = dict() - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) if clip_attr is None: clip_attr = NullGradientClipAttr() @@ -336,12 +338,14 @@ def append_gradient_clip_ops(param_grad): "clip attribute should be an instance of BaseGradientClipAttr" ) - clip_attr.process_context(context=context, param=p, grad=g) + clip_attr._process_context(context=context, param=p, grad=g) res = [] - for p, g in param_grad: - with p.block.program.optimized_guard(p): - res.append(clip_attr.create_operators(param=p, grad=g)) + for p, g in param_grads: + if g 
is None: + continue + with p.block.program.optimized_guard([p, g]): + res.append(clip_attr._create_operators(param=p, grad=g)) return res diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index 470dd0df524936a773f6e740c8079f0efa8ef7b4..b8fe9bd4c1988dd3f6fa82df391c3059dfbfcf93 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -69,8 +69,10 @@ class Go(BlockGuard): parent_block.append_op( type='go', inputs={ - 'X': - [parent_block.var_recursive(x_name) for x_name in x_name_list] + 'X': [ + parent_block._var_recursive(x_name) + for x_name in x_name_list + ] }, outputs={}, attrs={'sub_block': go_block}) @@ -259,7 +261,7 @@ class Select(BlockGuard): if var_name in intermediate ] - X = [select_block.var_recursive(x_name) for x_name in params] + X = [select_block._var_recursive(x_name) for x_name in params] # Needs to be used by `equal` inside the cases block. X.append(self.case_to_execute) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b436dfe70afdb52299222f8ba3f5bdff2842d103..f9e600cb4cb252baead87025db0e0db71e8169d2 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -309,7 +309,7 @@ class Executor(object): if not has_feed_operators(global_block, feed, feed_var_name): for i, name in enumerate(feed): out = global_block.var(name) - global_block.prepend_op( + global_block._prepend_op( type='feed', inputs={'X': [feed_var]}, outputs={'Out': [out]}, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ea3117e02bd993b06de39725b2c3296031065e3c..03e0ac757586150610aee275620d9eee77323c99 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -32,7 +32,6 @@ except Exception, e: import unique_name __all__ = [ - 'Block', 'Variable', 'Program', 'Operator', @@ -447,7 +446,7 @@ class Operator(object): Notes: The constructor of operator should not be invoked directly. 
Use - Block.append_op or Block.prepend_op instead. + Block.append_op or Block._prepend_op instead. Examples: .. code-block:: python @@ -870,7 +869,7 @@ class Block(object): def forward_block_idx(self): return self.desc.get_forward_block_idx() - def set_forward_block_idx(self, idx): + def _set_forward_block_idx(self, idx): """ Set the forward block Idx. @@ -880,7 +879,7 @@ class Block(object): Returns: None """ - self.desc.set_forward_block_idx(idx) + self.desc._set_forward_block_idx(idx) @property def idx(self): @@ -909,7 +908,7 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v - def var_recursive(self, name): + def _var_recursive(self, name): """ Get a Variable by name from this block recursively. @@ -951,9 +950,9 @@ class Block(object): raise ValueError("Var {0} is not found recursively".format(name)) def all_parameters(self): - return list(self.iter_parameters()) + return list(self._iter_parameters()) - def iter_parameters(self): + def _iter_parameters(self): return (item[1] for item in self.vars.iteritems() if isinstance(item[1], Parameter)) @@ -966,7 +965,7 @@ class Block(object): def has_var(self, name): return name in self.vars - def rename_var(self, name, new_name): + def _rename_var(self, name, new_name): """ Rename variable in vars and ops' inputs and outputs @@ -1000,8 +999,8 @@ class Block(object): else: raise ValueError("unsupported var type: %s", type(v)) orig_var_type = v.type - self.desc.rename_var(name, new_name) - # NOTE: v is destroyed by C++ after calling rename_var. + self.desc._rename_var(name, new_name) + # NOTE: v is destroyed by C++ after calling _rename_var. d = self.desc.find_var(new_name) if var_type == "Parameter": var = Parameter( @@ -1024,16 +1023,16 @@ class Block(object): error_clip=error_clip, stop_gradient=stop_gradient) - # rename the python side, sync_with_cpp will only add + # rename the python side, _sync_with_cpp will only add # new vars/ops to python side. 
self.vars[new_name] = var del self.vars[name] - self.sync_with_cpp() + self._sync_with_cpp() return var - def remove_var(self, name): - self.sync_with_cpp() - self.desc.remove_var(name) + def _remove_var(self, name): + self._sync_with_cpp() + self.desc._remove_var(name) del self.vars[name] def create_parameter(self, *args, **kwargs): @@ -1055,7 +1054,7 @@ class Block(object): self.ops.append(op) return op - def insert_op(self, index, *args, **kwargs): + def _insert_op(self, index, *args, **kwargs): """ Insert a Operator according to the giving arguments. @@ -1065,13 +1064,13 @@ class Block(object): Returns: Operator: the insert Operator. """ - self.sync_with_cpp() - op_desc = self.desc.insert_op(index) + self._sync_with_cpp() + op_desc = self.desc._insert_op(index) op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.insert(index, op) return op - def remove_op(self, index): + def _remove_op(self, index): """ Remove the specific position operator. @@ -1081,11 +1080,11 @@ class Block(object): Returns: None """ - self.sync_with_cpp() - self.desc.remove_op(index, index + 1) + self._sync_with_cpp() + self.desc._remove_op(index, index + 1) del self.ops[index] - def slice_ops(self, start, end): + def _slice_ops(self, start, end): """ Return the Operator between start and end. @@ -1098,13 +1097,13 @@ class Block(object): """ return self.ops[start:end] - def prepend_op(self, *args, **kwargs): - op_desc = self.desc.prepend_op() + def _prepend_op(self, *args, **kwargs): + op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) self.ops.insert(0, op) return op - def sync_with_cpp(self): + def _sync_with_cpp(self): """ Sync from the desc on the c++ end. This method is used to synchronize the c++ desc instance generated by backward. 
@@ -1170,7 +1169,7 @@ class Block(object): for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index] - def copy_param_info_from(self, other): + def _copy_param_info_from(self, other): """ Copy the information of parameters from the other block. @@ -1185,12 +1184,13 @@ class Block(object): None """ if not isinstance(other, Block): - raise TypeError("copy_param_info_from should be invoked with Block") - for p in other.iter_parameters(): + raise TypeError( + "_copy_param_info_from should be invoked with Block") + for p in other._iter_parameters(): assert isinstance(p, Parameter) v = self.vars.get(p.name, None) if v is None: - raise ValueError("copy_param_info_from should be invoked with " + raise ValueError("_copy_param_info_from should be invoked with " "same topology") assert isinstance(v, Variable) new_p = Parameter( @@ -1208,7 +1208,7 @@ class Block(object): name=v.name) self.vars[new_p.name] = new_p - def clone_variable(self, var): + def _clone_variable(self, var): """ Clone a variable into current block. @@ -1319,7 +1319,7 @@ class Program(object): self._op_role_var = [var_name] @contextlib.contextmanager - def optimized_guard(self, var): + def optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and :code:`OpRoleVar` automatically. @@ -1327,17 +1327,20 @@ class Program(object): Notes: This is a very low level API. Users should not use it directly. Args: - var(Variable|str): The variable (name) to be optimized. + param_and_grads(list): The variables (names) to be optimized. Examples: >>> p, g = backward(...) 
- >>> with program.optimized_guard(p): + >>> with program.optimized_guard([p,g]): >>> p = p - 0.001 * g """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize - self._op_role_var = [var.name if isinstance(var, Variable) else var] + self._op_role_var = [ + var.name if isinstance(var, Variable) else var + for var in param_and_grads + ] yield self._op_role_var = [] self._current_role = OpRole.Forward @@ -1481,9 +1484,9 @@ class Program(object): p = Program() p.desc = core.ProgramDesc(self.desc) p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] - p.sync_with_cpp() + p._sync_with_cpp() - p.copy_param_info_from(self) + p._copy_param_info_from(self) p.copy_data_info_from(self) return p @@ -1533,7 +1536,7 @@ class Program(object): res = Program() res.desc = core.prune(self.desc, targets_idx) res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] - res.sync_with_cpp() + res._sync_with_cpp() return res def inference_optimize(self): @@ -1559,7 +1562,7 @@ class Program(object): if op.has_attr('is_test'): op.set_attr('is_test', True) res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] - res.sync_with_cpp() + res._sync_with_cpp() return res @staticmethod @@ -1579,7 +1582,7 @@ class Program(object): p = Program() p.desc = core.ProgramDesc(binary_str) p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] - p.sync_with_cpp() + p._sync_with_cpp() return p @property @@ -1659,7 +1662,7 @@ class Program(object): """ self.current_block_idx = self.current_block().parent_idx - def sync_with_cpp(self): + def _sync_with_cpp(self): """ Synchronize Python instance to its binding C++ object instance. If the program is modified in C++ space, this method should be invoked. 
@@ -1673,9 +1676,9 @@ class Program(object): for block_idx in range(len(self.blocks), self.desc.num_blocks()): self.blocks.append(Block(self, block_idx)) for block in self.blocks: - block.sync_with_cpp() + block._sync_with_cpp() - def copy_param_info_from(self, other): + def _copy_param_info_from(self, other): """ Copy the information of parameters from other program. @@ -1689,13 +1692,13 @@ class Program(object): None """ if not isinstance(other, Program): - raise TypeError("copy_param_info_from should be invoked with " + raise TypeError("_copy_param_info_from should be invoked with " "Program") if len(self.blocks) != len(other.blocks): - raise ValueError("copy_param_info_from should be invoked with two " + raise ValueError("_copy_param_info_from should be invoked with two " "program, with represent the same topology") - self.global_block().copy_param_info_from(other.global_block()) + self.global_block()._copy_param_info_from(other.global_block()) def copy_data_info_from(self, other): """ @@ -1711,11 +1714,11 @@ class Program(object): None """ if not isinstance(other, Program): - raise TypeError("copy_param_info_from should be invoked with " + raise TypeError("_copy_param_info_from should be invoked with " "Program") if len(self.blocks) != len(other.blocks): - raise ValueError("copy_param_info_from should be invoked with two " + raise ValueError("_copy_param_info_from should be invoked with two " "program, with represent the same topology") for var in other.global_block().vars.itervalues(): if var.is_data: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 373e9c060de1ee27c165ccd2380cd8c38612c4d9..0e640bf280d396504deec1183821da3e8a156530 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -148,7 +148,7 @@ class ConstantInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended 
- op = block.prepend_op( + op = block._prepend_op( type="fill_constant", outputs={"Out": var}, attrs={ @@ -202,7 +202,7 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -256,7 +256,7 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ @@ -346,7 +346,7 @@ class XavierInitializer(Initializer): if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -359,7 +359,7 @@ class XavierInitializer(Initializer): else: std = np.sqrt(2.0 / float(fan_in + fan_out)) - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ @@ -444,7 +444,7 @@ class MSRAInitializer(Initializer): if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) - op = block.prepend_op( + op = block._prepend_op( type="uniform_random", outputs={"Out": var}, attrs={ @@ -457,7 +457,7 @@ class MSRAInitializer(Initializer): else: std = np.sqrt(2.0 / float(fan_in)) - op = block.prepend_op( + op = block._prepend_op( type="gaussian_random", outputs={"Out": var}, attrs={ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5c8f4f6507c7dd9b3d005639d962ce1e55b2c704..cf43998228a06d73f4d7d6dfc85dcd002078ba0f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -24,10 +24,7 @@ from . 
import core __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', - 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint', 'load_persist_vars_without_grad', - 'load_lookup_table_vars', 'save_persist_vars_without_grad', - 'get_latest_checkpoint_serial' + 'get_inference_program' ] @@ -526,7 +523,7 @@ def prepend_feed_ops(inference_program, for i, name in enumerate(feed_target_names): out = global_block.var(name) - global_block.prepend_op( + global_block._prepend_op( type='feed', inputs={'X': [feed_var]}, outputs={'Out': [out]}, @@ -628,7 +625,7 @@ def save_inference_model(dirname, for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": - global_block.remove_op(i) + global_block._remove_op(i) copy_program.desc.flush() pruned_program = copy_program.prune(targets=target_vars) @@ -794,588 +791,6 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -SUCCESS_MARK_FILENAME = "_SUCCESS" -CHECKPOINT_PREFIX = "checkpoint" -MODEL_DIR = "__model__" -LOOKUP_TABLE_DIR = "__lookup_table__" -TRAINER_PREFIX = "trainer" -CHECKPOINT_SEPARATOR = "_" - - -def save_checkpoint(executor, - checkpoint_dir, - trainer_id, - trainer_args=None, - main_program=None, - max_num_checkpoints=3, - lookup_table=None, - ps_endpoint_list=None): - """ - This function filters out all checkpoint variables from the give - main_program and then saves these variables to the `checkpoint_dir` - directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there might be a lot of checkpoints in the - `checkpoint_dir`. To avoid them taking too much disk space, the - `max_num_checkpoints` are introduced to limit the total number of - checkpoints. 
If the number of existing checkpints is greater than - the `max_num_checkpoints`, oldest ones will be scroll deleted. - - A variable is a checkpoint variable and will be saved if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for save checkpoint. - checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer - is chief. - trainer_args(dict|None): Current training arguments. Such as 'epoch_id' - and 'step_id'. - Defaut: None - main_program(Program|None): The program whose checkpoint variables will - be saved. If it is None, the default main program will be used. - max_num_checkpoints(int): The max number of total number of existing - checkpoints. - Default: 3 - lookup_table(string|None): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list|None): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - AssertionError: If `trainer_args` is not a dict. - - Examples: - .. 
code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - trainer_args = {"epoch_id": 200, - "step_id": 20} # just an example - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - fluid.io.save_checkpoint(executor=exe, - checkpoint_dir=path, - trainer_id=0, - trainer_args=trainer_args, - main_program=prog, - max_num_checkpoints=3, - lookup_table=table_name, - ps_endpoint_list = ps_endpoints) - """ - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - assert checkpoint_dir - - if trainer_args: - assert isinstance(trainer_args, dict) - - is_chief = trainer_id == 0 - - _make_chekcpoint_dirs(checkpoint_dir) - serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 - cur_dir = _get_serial_dir(checkpoint_dir, serial) - - save_trainer_args(cur_dir, trainer_id, trainer_args) - - if is_chief: - save_persist_vars_without_grad(executor, cur_dir, main_program) - - if is_chief and lookup_table and ps_endpoint_list: - save_pserver_vars_by_notify(executor, cur_dir, lookup_table, - ps_endpoint_list) - - _scroll_delete(checkpoint_dir, max_num_checkpoints) - - -def load_checkpoint(executor, checkpoint_dir, serial, main_program): - """ - This function filters out all checkpoint variables from the give - main_program and then try to load these variables from the - `checkpoint_dir` directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there are more than one checkpoint in the - `checkpoint_dir` (each checkpoint has its own sub folder), use - `serial` to specify which serial of checkpoint you would like to - load. - - A variable is a checkpoint variable and will be loaded if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". 
- - Args: - executor(Executor): The executor to run for loading checkpoint. - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - main_program(Program): The program whose checkpoint variables will - be loaded. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - ValueError: If `serial` is None or `serial` is less than 0. - ValueError: If `main_program` is None. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, - serial=9, main_program=prog) - - # In this example, `load_checkpoint` function - # will first filters out all checkpoint variables in the default - # main program, and then try to load these variables form the - # folder "./checkpoints/checkpoint_9/__model__". - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - if serial is None or serial < 0: - raise ValueError("'serial' should not be None or <0 ") - - if main_program is None: - raise ValueError('main_program should not be None.') - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program, True) - - -def clean_checkpoint(checkpoint_dir, delete_dir=False): - """ - clean the checkpoint dir, when the train exits normally, - the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. 
- - : param checkpoint_dir - : param delete_dir - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - _scroll_delete(checkpoint_dir, max_num_checkpoints=0) - - if delete_dir and not os.listdir(checkpoint_dir): - os.rmdir(checkpoint_dir) - - -def load_persist_vars_without_grad(executor, - dirname, - program, - has_model_dir=False): - """ - This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be loaded. - has_model_dir(bool): if True, the function loads variables - from a sub directory named '__model__'. - Default: False - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.load_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog, has_model_dir=True) - - # In this example, `load_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the - # folder "./my_paddle_model/__model__". - """ - - if has_model_dir: - dirname = _get_model_dir(dirname) - - load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): - """ - The parameter server will load lookup table's local file in - selectedrows variable. 
- - Args: - executor(Executor): The executor to run for loading persistable variables - dirname(str): The directory path - main_program(Program): Find the variable named table_name in main_program - pserver_id(int): the serial number in pserver_endpoints list - table_name(str): lookup table name - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - dirname = "./checkpoints/checkpoint_9/__model__" - prog = fluid.default_main_program() - pserver_id = 1 - table_name = "share_w" - fluid.io.load_lookup_table_vars(executor=exe, - dirname=dirname, program=prog, pserver_id=pserver_id, - table_name=table_name) - """ - - for var in program.list_vars(): - if var.name == table_name: - lookup_table_var = var - break - - assert lookup_table_var is not None - - lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) - - load_prog = Program() - load_block = load_prog.global_block() - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [lookup_table_var]}, - attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) - - executor.run(load_prog) - - -def save_persist_vars_without_grad(executor, dirname, program): - """ - This function filters out all checkpoint variables from the give - program and then save these variables to a sub-folder '__model__' of - the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for saving variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be saved. - - Returns: - None - - Examples: - .. 
code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - fluid.io.save_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog) - - # In this example, `save_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then saves these variables to the folder - # "./my_paddle_model/__model__". - """ - cur_dir = _get_model_dir(dirname) - save_vars( - executor, - dirname=cur_dir, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - _write_success(cur_dir) - - -def save_pserver_vars_by_notify(executor, dirname, lookup_table, - ps_endpoint_list): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save checkpoints. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. 
code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - fluid.io.save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - ps_endpoint_list=ps_endpoints) - """ - cur_dir = _get_lookuptable_dir(dirname) - - checkpoint_notify_program = Program() - checkpoint_notify_block = checkpoint_notify_program.global_block() - - attrs = {} - attrs['epmap'] = ps_endpoint_list - attrs['dir'] = cur_dir - attrs['lookup_table'] = lookup_table - - checkpoint_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(checkpoint_notify_program) - - -def save_trainer_args(dirname, trainer_id, trainer_args): - assert isinstance(trainer_args, dict) - - cur_dir = _get_trainer_dir(dirname, trainer_id) - - for name, value in trainer_args.iteritems(): - args_file = os.path.join(cur_dir, name) - with open(args_file, 'w') as f: - f.write(str(value)) - _write_success(cur_dir) - - -def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): - """ - trainer will load some args from it's independent directory, - such as epoch_id and step_id. - - Args: - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - trainer_id(int): current trainer id. - trainer_args(list): list about load trainer args - Return: - None - - Examples: - .. 
code-block:: python - - param_path = "./checkpoint/" - serial = 7 - trainer_id = 2 - trainer_args = ["epoch_id", "step_id"] - - fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial, - trainer_id=trainer_id, trainer_args=trainer_args) - """ - assert isinstance(trainer_args, list) - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_trainer_dir(cur_dir, trainer_id) - - ret_values = [] - - for arg in trainer_args: - cur_file = os.path.join(cur_dir, arg) - with open(cur_file, 'r') as f: - contents = f.read() - ret_values.append(contents.strip()) - return ret_values - - -def _is_checkpoint_var(var): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. - if ".block" in var.name: - return False - - return var.persistable - - -def _make_chekcpoint_dirs(dirs): - """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
- """ - assert dirs is not None - - if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) - - if not os.path.isdir(dirs): - try: - os.makedirs(dirs) - except OSError as err: - if err.errno != errno.EEXIST: - raise err - - -def _get_dir_serial(dirname): - _, serial = dirname.split(CHECKPOINT_SEPARATOR) - - try: - serial_num = int(serial) - except ValueError: - serial_num = -1 - return serial_num - - -def _get_serial_dir(dirname, serial): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - serial_dir = os.path.join(dirname, serial_folder) - _make_chekcpoint_dirs(serial_dir) - - return serial_dir - - -def _get_model_dir(dirname): - model_dir = os.path.join(dirname, MODEL_DIR) - _make_chekcpoint_dirs(model_dir) - return model_dir - - -def _get_lookuptable_dir(dirname): - lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - _make_chekcpoint_dirs(lookuptable_dir) - return lookuptable_dir - - -def _get_trainer_dir(dirname, trainer_id): - trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) - trainer_dir = os.path.join(dirname, trainer_folder) - _make_chekcpoint_dirs(trainer_dir) - return trainer_dir - - -def _scroll_delete(dirname, max_num_checkpoints=3): - dirs = os.listdir(dirname) - serial_map = {} - for serial in dirs: - serial_num = _get_dir_serial(serial) - serial_map[serial_num] = serial - - if len(serial_map.keys()) <= max_num_checkpoints: - return - - serials = serial_map.keys() - serials.sort(reverse=True) - serials = serials[max_num_checkpoints:] - for serial in serials: - cur_dir = _get_serial_dir(dirname, serial) - try: - shutil.rmtree(cur_dir) - except OSError as err: - if err.errno != errno.ENOENT: - raise err - - -def _write_success(dirname): - """ - write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. 
- - : param dirname - """ - success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) - with open(success_file, 'a') as f: - now = time.ctime() - f.write(now) - - -def get_latest_checkpoint_serial(checkpoint_dir): - """ - get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - - : param checkpoint_dir - """ - if not checkpoint_dir: - return -1 - - def has_success(checkpoint_dir, cur_dir): - """ - is _SUCCESS in this dir - """ - - serial = _get_dir_serial(cur_dir) - if serial == -1 or not os.path.isdir( - os.path.join(checkpoint_dir, cur_dir)): - return -1 - - success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, - SUCCESS_MARK_FILENAME) - if os.path.isfile(success_path): - return serial - - if not os.path.isdir(checkpoint_dir): - return -1 - - current_dir = -1 - dirs = os.listdir(checkpoint_dir) - for cur_dir in dirs: - success_num = has_success(checkpoint_dir, cur_dir) - if success_num > current_dir: - current_dir = success_num - return current_dir - - def get_test_program(filelist, program=None, startup_program=None): """ Transpile current train program to a program to read test dataset @@ -1459,7 +874,7 @@ def get_test_program(filelist, program=None, startup_program=None): main_block = program.global_block() for var in main_block.vars.values(): if var.type == core.VarDesc.VarType.READER: - main_block.rename_var( + main_block._rename_var( str(var.name), str(_get_test_reader_name(var.name))) for op in main_block.ops: @@ -1468,7 +883,7 @@ def get_test_program(filelist, program=None, startup_program=None): if op.type == "create_multi_pass_reader": test_op.set_attr("pass_num", 1) - startup_program.sync_with_cpp() - program.sync_with_cpp() + startup_program._sync_with_cpp() + program._sync_with_cpp() return program diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 86efd1ff51cf29485ee28b4d60ffb1439af1aad9..de752d1daeb6bc725cf6eff1bb74a786e2ad6b95 100644 --- 
a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -68,11 +68,11 @@ class LayerHelper(object): @property def param_attr(self): - return ParamAttr.to_attr(self.kwargs.get('param_attr', None)) + return ParamAttr._to_attr(self.kwargs.get('param_attr', None)) @property def bias_attr(self): - return ParamAttr.to_attr(self.kwargs.get('bias_attr', None)) + return ParamAttr._to_attr(self.kwargs.get('bias_attr', None)) def multiple_param_attr(self, length): param_attr = self.param_attr @@ -262,11 +262,11 @@ class LayerHelper(object): g_param = self.startup_program.global_block().create_parameter( dtype=dtype, shape=g_param_shape, - **g_param_attr.to_kwargs(with_initializer=False)) + **g_param_attr._to_kwargs(with_initializer=False)) v_param = self.startup_program.global_block().create_parameter( dtype=dtype, shape=v_param_shape, - **v_param_attr.to_kwargs(with_initializer=True)) + **v_param_attr._to_kwargs(with_initializer=True)) __norm_except_dim( x=v_param, out=g_param, @@ -275,9 +275,9 @@ class LayerHelper(object): # Add weight normalization to main_program g_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs()) + dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs()) v_param = self.main_program.global_block().create_parameter( - dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs()) + dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs()) w_param = __weight_normalize(g_param, v_param, dim=attr.dim) return w_param @@ -296,11 +296,11 @@ class LayerHelper(object): if default_initializer is None and attr.initializer is None: if is_bias: - attr.set_default_bias_initializer() + attr._set_default_bias_initializer() else: - attr.set_default_param_initializer() + attr._set_default_param_initializer() else: - attr.set_default_initializer(default_initializer) + attr._set_default_initializer(default_initializer) # If weight normalization is set, insert extra 
parameters and ops. # Refer to https://arxiv.org/pdf/1602.07868.pdf @@ -310,9 +310,9 @@ class LayerHelper(object): return param self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True)) + dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr.to_kwargs()) + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250..4917e67de0d20ff9e8f9a27f38e1bd2abef5c503 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -33,7 +33,6 @@ from metric_op import * from learning_rate_scheduler import * __all__ = [] -__all__ += math_op_patch.__all__ __all__ += nn.__all__ __all__ += io.__all__ __all__ += tensor.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 849474dc58461ac3772f439da7bf5d57592daa8c..782aa933f2ee86274e800045c9356d8072915fc1 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -730,8 +730,10 @@ class While(object): parent_block.append_op( type='while', inputs={ - 'X': - [parent_block.var_recursive(x_name) for x_name in x_name_list], + 'X': [ + parent_block._var_recursive(x_name) + for x_name in x_name_list + ], 'Condition': [self.cond_var] }, outputs={'Out': out_vars, @@ -1259,7 +1261,7 @@ class ConditionalBlock(object): input_set = set([ipt.name for ipt in self.inputs]) param_list = [ - parent_block.var_recursive(each_name) for each_name in params + parent_block._var_recursive(each_name) for each_name in params if each_name not in input_set ] diff --git a/python/paddle/fluid/layers/detection.py 
b/python/paddle/fluid/layers/detection.py index bcfc716739bb0d6fea9aa34db22473c9726d62a1..3ef4afa691b1dfba07fb132753f380727bb4f3ae 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -789,7 +789,8 @@ def prior_box(input, clip=False, steps=[0.0, 0.0], offset=0.5, - name=None): + name=None, + min_max_aspect_ratios_order=False): """ **Prior Box Operator** @@ -818,6 +819,11 @@ def prior_box(input, Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 name(str): Name of the prior box op. Default: None. + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. Returns: tuple: A tuple with two Variable (boxes, variances) @@ -871,7 +877,8 @@ def prior_box(input, 'clip': clip, 'step_w': steps[0], 'step_h': steps[1], - 'offset': offset + 'offset': offset, + 'min_max_aspect_ratios_order': min_max_aspect_ratios_order } if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: if not _is_list_or_tuple_(max_sizes): @@ -911,7 +918,8 @@ def multi_box_head(inputs, kernel_size=1, pad=0, stride=1, - name=None): + name=None, + min_max_aspect_ratios_order=False): """ Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. The details of this algorithm, please refer the @@ -954,6 +962,11 @@ def multi_box_head(inputs, pad(int|list|tuple): The padding of conv2d. Default:0. stride(int|list|tuple): The stride of conv2d. Default:1, name(str): Name of the prior box layer. Default: None. + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. 
Please note, this order affects the weights order of + convolution layer followed by and does not affect the fininal + detection results. Default: False. Returns: tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances) @@ -1068,7 +1081,8 @@ def multi_box_head(inputs, step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0] box, var = prior_box(input, image, min_size, max_size, aspect_ratio, - variance, flip, clip, step, offset) + variance, flip, clip, step, offset, None, + min_max_aspect_ratios_order) box_results.append(box) var_results.append(var) diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py index e0c1aab230aeed7fb858e91e7da7eae58032ee16..384d302a709eeec220864b9e8c9210ed028470f6 100644 --- a/python/paddle/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -18,10 +18,12 @@ All util layers. from layer_function_generator import autodoc from ..framework import unique_name from ..layer_helper import LayerHelper +from ..annotations import deprecated -__all__ = ['get_places'] +__all__ = [] +@deprecated(since='0.15.0', instead="ParallelExecutor") @autodoc() def get_places(device_count=None, device_type=None): helper = LayerHelper('get_places', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 977abde21f38a0d25a90bc14426fd817df2c8508..0665c09bfb52c932219be68ca801cfa951d672d3 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -22,9 +22,9 @@ from ..executor import global_scope from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', - 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', - 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' + 'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', + 'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor', 
+ 'load' ] @@ -445,6 +445,88 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): return monkey_patch_reader_methods(main_prog_var) +def py_reader(capacity, shapes, dtypes, lod_levels=None): + """ + Create a reader and blocking queue for data feeding in Python + + This layer returns a Reader Variable and a BlockingQueue. + The BlockingQueue provides `push()` method to push a `LoDTensorArray` + object into the queue in Python side. In C++ side, the Reader + Variable would invoke `pop()` method of the queue to retrieve the + feeding data. The process of feeding data in Python side and fetching + data in C++ side can run in parallel. The BlockingQueue should be closed + using `close()` method when unused. + + Args: + capacity(int): The maximum capacity of the BlockingQueue. + shapes(list): List of tuples which declaring data shapes. + dtypes(list): List of strs which declaring data type. + lod_levels(list): List of ints which declaring data lod_level. + + Returns: + tuple(Variable, BlockingQueue): + A Reader Variable from which we can get feeding data. + + A BlockingQueue object for data feeding. + + Examples: + + .. 
code-block:: python + + reader, queue = fluid.layers.py_reader( + capacity=10, + shapes=[[-1,3,224,224], [-1,1]], + dtypes=['float32', 'int64']) + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.read_file(reader) + + # Via the blocking queue, we can feed data using threads + def feed_data(queue, feed_images, feed_labels): + for feed_image, feed_label in zip(feed_images, feed_labels): + data = core.LoDTensorArray() + data.append(feed_image) + data.append(feed_label) + queue.push(data) + + thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels)) + thread.start() + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + if lod_levels is None: + lod_levels = [0] * len(shapes) + + queue_name = unique_name('lod_tensor_blocking_queue') + var = global_scope().var(queue_name) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=unique_name('create_py_reader')) + startup_blk.append_op( + type='create_py_reader', + inputs={'blocking_queue': queue_name}, + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + + return monkey_patch_reader_methods(main_prog_var), feed_queue + + def open_files(filenames, shapes, lod_levels, @@ -719,7 +801,7 @@ class Preprocessor(object): self.sink_var_names = None self.status = Preprocessor.BEFORE_SUB_BLOCK - def is_completed(self): + def _is_completed(self): return self.sub_block and self.source_var_names and self.sink_var_names @contextlib.contextmanager @@ -729,7 +811,7 @@ class 
Preprocessor(object): yield self.main_prog.rollback() self.status = Preprocessor.AFTER_SUB_BLOCK - if not self.is_completed(): + if not self._is_completed(): raise RuntimeError( "The definition of preprocessor is incompleted! " "Please make sure that you have set input and output " diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 1754061c4ba6f5b97bced3548bc412dfb1b7932c..f814c41633fbac76eb9411e2f418f521e8e9679d 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -16,8 +16,6 @@ from ..framework import Variable, unique_name from layer_function_generator import OpProtoHolder from ..initializer import force_init_on_cpu -__all__ = ['monkey_patch_variable'] - def monkey_patch_variable(): def unique_tmp_name(): diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 99e82fdd04282177fae63f1fb94b5e32d41c612e..194a16b123c441ac1318b8ce58158f67e2a8093d 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -76,7 +76,7 @@ def accuracy(input, label, k=1, correct=None, total=None): return acc_out -def auc(input, label, curve='ROC', num_thresholds=200): +def auc(input, label, curve='ROC', num_thresholds=200, topk=1): """ **Area Under the Curve (AUC) Layer** @@ -102,6 +102,7 @@ def auc(input, label, curve='ROC', num_thresholds=200): curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'. num_thresholds(int): The number of thresholds to use when discretizing the roc curve. Default 200. + topk(int): only topk number of prediction output will be used for auc. Returns: Variable: A scalar representing the current AUC. 
@@ -115,7 +116,7 @@ def auc(input, label, curve='ROC', num_thresholds=200): """ warnings.warn( - "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ + "This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \ but can not aggregate them and get the pass AUC, because pass \ auc can not be averaged with weighted from the minibatch auc value. \ Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ @@ -125,14 +126,34 @@ def auc(input, label, curve='ROC', num_thresholds=200): topk_indices = helper.create_tmp_variable(dtype="int64") topk_out, topk_indices = nn.topk(input, k=k) auc_out = helper.create_tmp_variable(dtype="float32") + # make tp, tn, fp, fn persistable, so that can accumulate all batches. + tp = helper.create_global_variable(persistable=True) + tn = helper.create_global_variable(persistable=True) + fp = helper.create_global_variable(persistable=True) + fn = helper.create_global_variable(persistable=True) + for var in [tp, tn, fp, fn]: + helper.set_variable_initializer( + var, Constant( + value=0.0, force_cpu=True)) + helper.append_op( type="auc", inputs={ "Out": [topk_out], "Indices": [topk_indices], - "Label": [label] + "Label": [label], + "TP": [tp], + "TN": [tn], + "FP": [fp], + "FN": [fn] }, attrs={"curve": curve, "num_thresholds": num_thresholds}, - outputs={"AUC": [auc_out], }) + outputs={ + "AUC": [auc_out], + "TPOut": [tp], + "TNOut": [tn], + "FPOut": [fp], + "FNOut": [fn] + }) return auc_out diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947..56124663929d1e33b7144ab57ae3b3c55e1652b3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1,4 +1,18 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -71,6 +85,7 @@ __all__ = [ 'transpose', 'im2sequence', 'nce', + 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', @@ -3857,6 +3872,74 @@ def nce(input, return cost / (num_neg_samples + 1) +def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None): + """ + The hierarchical sigmoid operator is used to accelerate the training + process of language model. This operator organizes the classes into a + complete binary tree, each leaf node represents a class(a word) and each + internal node acts as a binary classifier. For each word there's a unique + path from root to it's leaf node, hsigmoid calculate the cost for each + internal node on the path, and sum them to get a total cost. hsigmoid can + achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the size of word dict. + + Refer to `Hierarchical Probabilistic Neural Network Language Model + `_ + + Args: + input (Variable): The input tensor variable with shape + :math:`[N \\times D]`, where :math:`N` is the size of mini-batch, + and :math:`D` is the feature size. + label (Variable): The tensor variable contains labels of training data. + It's a tensor with shape is :math:`[N \\times 1]`. 
+ num_classes: (int), The number of classes, must not be less than 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter + attribute for learnable parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter + attribute for the bias of this layer. If it is set to False, no + bias will be applied. + + Returns: + Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[2], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='int64') + out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6) + """ + + helper = LayerHelper('hierarchical_sigmoid', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + pre_out = helper.create_tmp_variable(dtype) + dim = input.shape[1] + if num_classes < 2: + raise ValueError("num_classes must not be less than 2.") + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes - 1, dim], + is_bias=False, + dtype=input.dtype) + inputs = {"X": input, "W": weights, "Label": label} + if helper.bias_attr: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, num_classes - 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + helper.append_op( + type="hierarchical_sigmoid", + inputs=inputs, + outputs={"Out": out, + "PreOut": pre_out}, + attrs={"num_classes": num_classes}) + return out + + def transpose(x, perm, name=None): """ Permute the dimensions of `input` according to `perm`. 
@@ -3900,7 +3983,13 @@ def transpose(x, perm, name=None): return out -def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): +def im2sequence(input, + filter_size=1, + stride=1, + padding=0, + input_image_size=None, + out_stride=1, + name=None): """ Extracts image patches from the input tensor to form a tensor of shape {input.batch_size * output_height * output_width, filter_size_H * @@ -3937,6 +4026,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): padding_up = padding_down = padding_left = padding_right = padding Default: padding = 0. + input_image_size(Variable): the input contains image real size.It's dim + is [batchsize, 2]. It is dispensable.It is just for batch inference. + + out_stride(int|tuple): The scaling of image through CNN. It is + dispensable. It is valid only when input_image_size is not null. + If out_stride is tuple, it must contain two intergers, + (out_stride_H, out_stride_W). Otherwise, + the out_stride_H = out_stride_W = out_stride. + name (int): The name of this layer. It is optional. Returns: @@ -3987,7 +4085,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): [ 5. 7. 2. 4. 1. 3. 9. 0.] [ 7. 9. 4. 8. 3. 5. 0. 
8.]] - output.dims = {8, 9} + output.dims = {8, 8} output.lod = [[4, 4]] @@ -4009,18 +4107,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): if len(padding) == 2: padding.append(padding[0]) padding.append(padding[1]) - + inputs = {"X": input} + attrs = {"kernels": filter_size, "strides": stride, "padding": padding} + if input_image_size: + if isinstance(out_stride, int): + out_stride = [out_stride, out_stride] + inputs["Y"] = input_image_size + attrs["out_stride"] = out_stride helper = LayerHelper('im2sequence', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) helper.append_op( - type='im2sequence', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'kernels': filter_size, - 'strides': stride, - 'paddings': padding, - }) + type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs) return out @@ -4270,7 +4367,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): helper.set_variable_initializer( counter, initializer=Constant( value=begin - 1, force_cpu=True)) - helper.main_program.global_block().prepend_op( + helper.main_program.global_block()._prepend_op( type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33..7fc8e106fb43666be9c1ea245994dc1c7ac85d7d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -29,7 +29,7 @@ __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', - 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer' + 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer' ] @@ -67,7 +67,7 @@ class Optimizer(object): self._LARS_weight_decay = LARS_weight_decay def 
_create_global_learning_rate(self): - lr = self.global_learning_rate() + lr = self._global_learning_rate() if isinstance(lr, framework.Variable): return @@ -86,7 +86,7 @@ class Optimizer(object): dtype='float32' if self._dtype == None else self._dtype, persistable=True) - def global_learning_rate(self, program=None): + def _global_learning_rate(self, program=None): """ get global decayed learning rate :return: @@ -110,9 +110,9 @@ class Optimizer(object): return param_lr else: if param_lr == 1.0: - return self.global_learning_rate() + return self._global_learning_rate() else: - return self.global_learning_rate() * param_lr + return self._global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -123,7 +123,7 @@ class Optimizer(object): """ pass - def _finish_update(self, block): + def _finish_update(self, block, parameters_and_grads): """Finish any custom updates needed before completing an optimization step @@ -132,7 +132,7 @@ class Optimizer(object): parameters: list of parameter variables for the optimizer Returns: - list of finish ops or None + None """ pass @@ -185,10 +185,10 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] - def create_optimization_pass(self, - parameters_and_grads, - loss, - startup_program=None): + def _create_optimization_pass(self, + parameters_and_grads, + loss, + startup_program=None): """Add optimization operators to update gradients to variables. 
Args: @@ -221,25 +221,26 @@ class Optimizer(object): self._create_global_learning_rate() if self._LARS_weight_decay > 0.0: layers.append_LARS(parameters_and_grads, - self.global_learning_rate(), + self._global_learning_rate(), self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue with param_and_grad[0].block.program.optimized_guard( - param_and_grad[0]): - if param_and_grad[0].trainable is True and param_and_grad[ - 1] is not None: + param_and_grad): + if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block) + self._finish_update(loss.block, parameters_and_grads) end = len(global_block.ops) - return global_block.slice_ops(start, end) + return global_block._slice_ops(start, end) def minimize(self, loss, @@ -262,8 +263,8 @@ class Optimizer(object): params_grads = append_regularization_ops(params_grads, self.regularization) - optimize_ops = self.create_optimization_pass(params_grads, loss, - startup_program) + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) return optimize_ops, params_grads @@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer): """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" def __init__(self, learning_rate=0.001, @@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer): def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) - main_block = block.program.global_block() - # Create beta1 and beta2 power tensors - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - 
shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - - self._beta2_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta2_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - - self.helper.set_variable_initializer( - self._beta2_pow_acc, initializer=Constant(self._beta2)) - # Create accumulator tensors for first and second moments for p in parameters: self._add_accumulator(self._moment1_acc_str, p) self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta2, + shape=[1]) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer): param_and_grad[0]) moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + # create the adam optimize op adam_op = block.append_op( type=self.type, @@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment1": moment1, "Moment2": moment2, - "Beta1Pow": self._beta1_pow_acc, - "Beta2Pow": self._beta2_pow_acc + "Beta1Pow": beta1_pow_acc, + "Beta2Pow": beta2_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -566,24 +564,30 @@ class AdamOptimizer(Optimizer): return adam_op - def _finish_update(self, block): + def _finish_update(self, block, param_and_grads): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - 
scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - scale_beta2 = main_block.append_op( - type="scale", - inputs={"X": self._beta2_pow_acc}, - outputs={"Out": self._beta2_pow_acc}, - attrs={"scale": self._beta2}) - - return [scale_beta1, scale_beta2] + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): @@ -626,6 +630,7 @@ class AdamaxOptimizer(Optimizer): """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" def __init__(self, learning_rate=0.001, @@ -645,21 +650,16 @@ class AdamaxOptimizer(Optimizer): self._epsilon = epsilon def _create_accumulators(self, block, parameters): - # Create beta1 power accumulator tensor - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - # Create accumulator tensors for first moment and infinity norm for p in parameters: self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) def _append_optimize_op(self, block, 
param_and_grad): assert isinstance(block, framework.Block) @@ -667,6 +667,8 @@ class AdamaxOptimizer(Optimizer): moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) inf_norm = self._get_accumulator(self._inf_norm_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) # create the adamax optimize op adamax_op = block.append_op( type=self.type, @@ -676,7 +678,7 @@ class AdamaxOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment": moment, "InfNorm": inf_norm, - "Beta1Pow": self._beta1_pow_acc + "Beta1Pow": beta1_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -691,18 +693,22 @@ class AdamaxOptimizer(Optimizer): return adamax_op - def _finish_update(self, block): + def _finish_update(self, block, parameters_and_grads): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - return [scale_beta1] + for param, grad in parameters_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) class DecayedAdagradOptimizer(Optimizer): @@ -1156,7 +1162,10 @@ class ModelAverage(Optimizer): self.params_grads.append((param, grad)) for param, grad in self.params_grads: - self._append_average_accumulate_op(param) + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): + self._append_average_accumulate_op(param) self.apply_program = Program() block = self.apply_program.global_block() @@ -1171,16 +1180,16 @@ class ModelAverage(Optimizer): 
self._add_average_restore_op(block, param_grad) def _add_average_apply_op(self, block, param_grad): - param = block.clone_variable(param_grad[0]) - grad = block.clone_variable(param_grad[1]) - sum_1 = block.clone_variable(self._get_accumulator('sum_1', param)) - sum_2 = block.clone_variable(self._get_accumulator('sum_2', param)) - sum_3 = block.clone_variable(self._get_accumulator('sum_3', param)) - num_accumulates = block.clone_variable( + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) + sum_1 = block._clone_variable(self._get_accumulator('sum_1', param)) + sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) + sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) + num_accumulates = block._clone_variable( self._get_accumulator('num_accumulates', param)) - old_num_accumulates = block.clone_variable( + old_num_accumulates = block._clone_variable( self._get_accumulator('old_num_accumulates', param)) - num_updates = block.clone_variable( + num_updates = block._clone_variable( self._get_accumulator('num_updates', param)) # backup param value to grad layers.assign(input=param, output=grad) @@ -1194,8 +1203,8 @@ class ModelAverage(Optimizer): layers.elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param_grad): - param = block.clone_variable(param_grad[0]) - grad = block.clone_variable(param_grad[1]) + param = block._clone_variable(param_grad[0]) + grad = block._clone_variable(param_grad[1]) layers.assign(input=grad, output=param) def _append_average_accumulate_op(self, param): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 6baf648198585022f992709c519038688af293e1..10028a8c6e33edcea27650d925ca7378b770f143 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -152,7 +152,7 @@ class ParallelExecutor(object): self.executor = core.ParallelExecutor( self._places, 
set([ - p.name for p in main.global_block().iter_parameters() + p.name for p in main.global_block()._iter_parameters() if not p.stop_gradient ]), set(self.persistable_vars), main.desc, loss_name diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 0a42b9fca8dba7a11b414990be6c04c93158864f..4a61f85ec4b5c5108ded31632af75dbbdaaaba71 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -67,7 +67,7 @@ class ParamAttr(object): self.gradient_clip = gradient_clip self.model_average = do_model_average - def set_default_initializer(self, initializer): + def _set_default_initializer(self, initializer): """ Set the default initializer, the initializer should be Constant, Uniform, Normal, Xavier, MSRA. @@ -88,7 +88,7 @@ class ParamAttr(object): self.initializer = initializer - def set_default_param_initializer(self): + def _set_default_param_initializer(self): """ Set the default initializer for the parameter with Xavier. @@ -98,9 +98,9 @@ class ParamAttr(object): Returns: None. """ - self.set_default_initializer(Xavier()) + self._set_default_initializer(Xavier()) - def set_default_bias_initializer(self): + def _set_default_bias_initializer(self): """ Set the default initializer for the bias with Constant(0.0). @@ -110,10 +110,10 @@ class ParamAttr(object): Returns: None. """ - self.set_default_initializer(Constant(0.0)) + self._set_default_initializer(Constant(0.0)) @staticmethod - def to_attr(arg): + def _to_attr(arg): """ Create ParamAttr[s]. 
@@ -131,7 +131,7 @@ class ParamAttr(object): if arg is None: return ParamAttr() elif isinstance(arg, list) or isinstance(arg, tuple): - return [ParamAttr.to_attr(a) for a in arg] + return [ParamAttr._to_attr(a) for a in arg] elif isinstance(arg, ParamAttr): return arg elif isinstance(arg, str) or isinstance(arg, unicode): @@ -141,11 +141,11 @@ class ParamAttr(object): elif isinstance(arg, WeightDecayRegularizer): return ParamAttr(regularizer=arg) elif isinstance(arg, bool): - return ParamAttr.to_attr(None) if arg else False + return ParamAttr._to_attr(None) if arg else False else: raise TypeError("{0} cast to ParamAttr".format(type(arg))) - def to_kwargs(self, with_initializer=False): + def _to_kwargs(self, with_initializer=False): """ Returns the attributes of this parameter. diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index dac474d5ee76590a75311d6bf2c4cb2fe85b6c40..080c185420bdc79d6da1d5a52fdd11fa4105d59a 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -15,10 +15,7 @@ import framework from . 
import core -__all__ = [ - 'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer', - 'L2DecayRegularizer' -] +__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] def append_regularization_ops(parameters_and_grads, regularization=None): @@ -44,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: - with param.block.program.optimized_guard(param): - # If no gradient then we don't need to do anything - if grad is None: - params_and_grads.append((param, grad)) - continue - + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + with param.block.program.optimized_guard([param, grad]): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 1df7b99aad6094a8b8ddfe783b9de35cef61c524..95002aa7f9bb639828b47eb1e86e4ef954fb85ff 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import print_function - +from paddle.fluid.layers.device import get_places import unittest import paddle.fluid as fluid import paddle @@ -144,7 +144,7 @@ def train(word_dict, cost, acc_out, prediction = net_method( data, label, input_dim=dict_dim, class_dim=class_dim) else: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): cost, acc, _ = net_method( diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 5f5c8544bbdb87421f129b201a0ebaf4cb8602a1..49f549fa184037a64aa846f0d1d0e1b57db1f2ef 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -12,15 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function -import argparse -import paddle.fluid as fluid -import paddle -import sys -import numpy -import unittest + import math -import sys import os +import sys +import unittest + +import numpy + +import paddle +import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places BATCH_SIZE = 64 @@ -76,7 +78,7 @@ def train(nn_type, net_conf = conv_net if parallel: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): img_ = pd.read_input(img) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 49bd72c7a53c0ae740bdbabe15b1d37340699d41..80e0692bc640efc280c43bd5b929847ad29207c4 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -14,6 +14,7 @@ import paddle import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import unittest import os import numpy as np @@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, 
is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = fluid.layers.get_places() + places = get_places() pd = fluid.layers.ParallelDo(places) with pd.do(): avg_cost, predict_word = __network__( diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index be347cd5315668dde0454d7959dbf9bcfa465b5f..bec9f8594ff7c1aff8ae5ed55c9623754d9ea091 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import paddle -import paddle.fluid as fluid import math import sys +import paddle +import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places + # need to fix random seed and training data to compare the loss # value accurately calculated by the default and the memory optimization # version. 
@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda(): use_nccl = False place = fluid.CUDAPlace(0) -places = fluid.layers.get_places(device_count=0, device_type=device_type) +places = get_places(device_count=0, device_type=device_type) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) with pd.do(): x_ = pd.read_input(x) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e056ef9952a519d6c4d580b27f1118a3a91f13af..6824ede82b74c4e9783682149db870a471c35079 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -60,8 +60,8 @@ def get_numeric_gradient(place, return np.array(sum).mean() tensor_to_check = scope.find_var(input_to_check).get_tensor() - tensor_size = product(tensor_to_check.get_dims()) - tensor_to_check_dtype = tensor_to_check.dtype() + tensor_size = product(tensor_to_check.shape()) + tensor_to_check_dtype = tensor_to_check._dtype() if tensor_to_check_dtype == core.VarDesc.VarType.FP32: tensor_to_check_dtype = np.float32 elif tensor_to_check_dtype == core.VarDesc.VarType.FP64: @@ -74,15 +74,15 @@ def get_numeric_gradient(place, def __get_elem__(tensor, i): if tensor_to_check_dtype == np.float32: - return tensor.get_float_element(i) + return tensor._get_float_element(i) else: - return tensor.get_double_element(i) + return tensor._get_double_element(i) def __set_elem__(tensor, i, e): if tensor_to_check_dtype == np.float32: - tensor.set_float_element(i, e) + tensor._set_float_element(i, e) else: - tensor.set_double_element(i, e) + tensor._set_double_element(i, e) # we only compute gradient of one element each time. # we use a for loop to compute the gradient of every element. 
@@ -107,7 +107,7 @@ def get_numeric_gradient(place, __set_elem__(tensor_to_check, i, origin) gradient_flat[i] = (y_pos - y_neg) / delta / 2 - return gradient_flat.reshape(tensor_to_check.get_dims()) + return gradient_flat.reshape(tensor_to_check.shape()) class OpTest(unittest.TestCase): @@ -125,7 +125,7 @@ class OpTest(unittest.TestCase): @classmethod def tearDownClass(cls): - '''Restore random seeds''' + """Restore random seeds""" np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index cddf00765f4894126988c794763c34629449e8e6..fcf86cc5839113b75855ce97459b2ee4881238cd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -35,7 +35,8 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict=None, seed=None, use_parallel_executor=True, - balance_parameter_opt_between_cards=False): + use_reduce=False, + optimizer=fluid.optimizer.Adam): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -50,14 +51,19 @@ class TestParallelExecutorBase(unittest.TestCase): main = fluid.Program() startup = fluid.Program() startup.random_seed = 1 # Fix random seed + main.random_seed = 1 with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed + main.random_seed = seed + loss = method(use_feed=feed_dict is not None) - adam = fluid.optimizer.Adam() - adam.minimize(loss) + + optimizer().minimize(loss) + if memory_opt: fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) @@ -65,7 +71,8 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay 
build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_parallel_executor: exe = fluid.ParallelExecutor( diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 948836039be48ad74d5556100f06231bb89f26d3..6bd5e2332a99693f5e53e147491aa83c35859548 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -24,7 +24,20 @@ class TestAucOp(OpTest): indices = np.random.randint(0, 2, (128, 2)) labels = np.random.randint(0, 2, (128, 1)) num_thresholds = 200 - self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels} + tp = np.zeros((num_thresholds, )).astype("int64") + tn = np.zeros((num_thresholds, )).astype("int64") + fp = np.zeros((num_thresholds, )).astype("int64") + fn = np.zeros((num_thresholds, )).astype("int64") + + self.inputs = { + 'Out': pred, + 'Indices': indices, + 'Label': labels, + 'TP': tp, + 'TN': tn, + 'FP': fp, + 'FN': fn + } self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} # NOTE: sklearn use a different way to generate thresholds # which will cause the result differs slightly: @@ -71,7 +84,13 @@ class TestAucOp(OpTest): y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 auc_value = np.sum(x * y) - self.outputs = {'AUC': auc_value} + self.outputs = { + 'AUC': auc_value, + 'TPOut': tp_list, + 'FNOut': fn_list, + 'TNOut': tn_list, + 'FPOut': fp_list + } def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 
a62ee9596d0f6c58135b4a13249b638e84e63c3c..fcb2612326e74cf6417aa93f2691154c79b5e44c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -129,7 +129,6 @@ def create_or_get_tensor(scope, var_name, var, place): if var is not None: assert isinstance(var, np.ndarray) tensor.set_recursive_sequence_lengths([]) - tensor.set_dims(var.shape) tensor.set(var, place) return tensor diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 06e676cd83e77549afd679e730426c590cc046bf..7f2a9e6971ed933463216e38498d48ab132a1a37 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -16,8 +16,6 @@ import unittest import paddle.fluid as fluid import paddle.fluid.layers as layers -import paddle.fluid.framework as framework -import paddle.fluid.optimizer as optimizer from paddle.fluid.backward import calc_gradient diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py deleted file mode 100644 index e22400a045ced16c46b0bf005155f621f249d263..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle.fluid as fluid -import unittest -import os -import tempfile - - -class TestCheckpoint(unittest.TestCase): - def setUp(self): - self.dirname = tempfile.mktemp() - self.max_num_checkpoints = 3 - self.epoch_interval = 1 - self.step_interval = 1 - self.trainer_id = 0 - self.chief = self.trainer_id == 0 - self.place = fluid.CPUPlace() - self.epoch_id = 100 - self.step_id = 20 - - def test_checkpoint(self): - self.save_checkpoint() - serial = fluid.io.get_latest_checkpoint_serial(self.dirname) - self.assertTrue(serial >= 0) - trainer_args = ["epoch_id", "step_id"] - epoch_id, step_id = fluid.io.load_trainer_args( - self.dirname, serial, self.trainer_id, trainer_args) - self.assertEqual(self.step_id, int(step_id)) - self.assertEqual(self.epoch_id, int(epoch_id)) - - program = fluid.Program() - with fluid.program_guard(program): - exe = fluid.Executor(self.place) - fluid.io.load_checkpoint(exe, self.dirname, serial, program) - - fluid.io.clean_checkpoint(self.dirname, delete_dir=True) - self.assertFalse(os.path.isdir(self.dirname)) - - def save_checkpoint(self): - config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, - self.epoch_interval, self.step_interval) - - trainer_args = {} - trainer_args["epoch_id"] = self.epoch_id - trainer_args["step_id"] = self.step_id - - program = fluid.Program() - with fluid.program_guard(program): - program.global_block().create_var( - name="scale_0", - psersistable=True, - dtype="float32", - shape=[32, 32]) - - exe = fluid.Executor(self.place) - for i in xrange(10): - fluid.io.save_checkpoint(exe, config.checkpoint_dir, - self.trainer_id, trainer_args, program, - config.max_num_checkpoints) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 562e66b0625083fe840d64967249f0215cfda1f9..aab8969a96ff69d1a306506337a0e009f14758b9 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -22,6 +22,9 @@ import numpy import paddle.fluid as fluid import paddle.fluid.layers as layers +from paddle.fluid.layers.io import ListenAndServ +from paddle.fluid.layers.io import Recv +from paddle.fluid.layers.io import Send class TestSendOp(unittest.TestCase): @@ -65,8 +68,7 @@ class TestSendOp(unittest.TestCase): main = fluid.Program() with fluid.program_guard(main): - serv = layers.ListenAndServ( - "127.0.0.1:0", ["X"], optimizer_mode=False) + serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False) with serv.do(): out_var = main.global_block().create_var( name="scale_0.tmp_0", @@ -99,8 +101,8 @@ class TestSendOp(unittest.TestCase): persistable=False, shape=[32, 32]) fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) - layers.Send("127.0.0.1:%d" % port, [x]) - o = layers.Recv("127.0.0.1:%d" % port, [get_var]) + Send("127.0.0.1:%d" % port, [x]) + o = Recv("127.0.0.1:%d" % port, [get_var]) exe = fluid.Executor(place) self.dist_out = exe.run(main, fetch_list=o) # o is a list diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 75b4b4e50da04521021dcb1e97cfe495f2619433..9dbef0693bb129186dfc50f6efdd0896deedda81 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -27,7 +27,6 @@ class TranspilerTest(unittest.TestCase): self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" self.pserver1_ep = "127.0.0.1:6174" self.pserver2_ep = "127.0.0.1:6175" - self.slice_var_up = True self.sync_mode = True self.transpiler = None @@ -52,27 +51,26 @@ class TranspilerTest(unittest.TestCase): self.origin_prog = main.clone() return main - def get_trainer(self): - t = self._transpiler_instance() + def get_trainer(self, config=None): + t = self._transpiler_instance(config) 
return t.get_trainer_program() - def get_pserver(self, ep): - t = self._transpiler_instance() + def get_pserver(self, ep, config=None): + t = self._transpiler_instance(config) pserver = t.get_pserver_program(ep) startup = t.get_startup_program(ep, pserver) return pserver, startup - def _transpiler_instance(self): + def _transpiler_instance(self, config=None): if not self.transpiler: main = self.get_main_program() - self.transpiler = fluid.DistributeTranspiler() + self.transpiler = fluid.DistributeTranspiler(config=config) self.transpiler.transpile( self.trainer_id, program=main, pservers=self.pserver_eps, - trainers=self.trainers, - slice_var_up=self.slice_var_up, - sync_mode=self.sync_mode) + trainers=self.trainers) + return self.transpiler @@ -124,14 +122,67 @@ class TestBasicModel(TranspilerTest): self.assertEqual(set(pserver_params), set(trainer_params)) +class TestBasicModelWithLargeBlockSize(TranspilerTest): + def test_transpiler(self): + config = fluid.DistributeTranspilerConfig() + config.min_block_size = 1048576 + + pserver, startup = self.get_pserver(self.pserver1_ep, config) + pserver2, startup2 = self.get_pserver(self.pserver2_ep, config) + + trainer = self.get_trainer(config) + + self.assertEqual([op.type for op in trainer.global_block().ops], [ + 'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean', + 'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'send_barrier', + 'recv', 'recv', 'fetch_barrier' + ]) + + self.assertEqual(len(pserver.blocks), 2) + # block0: listen_and_serv + self.assertEqual([op.type for op in pserver.blocks[0].ops], + ["listen_and_serv"]) + # block1~2: optimize pass + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "sgd"]) + # confirm startup program + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "fill_constant", "fill_constant"]) + # the variable #fc_w will be split into two 
blocks + fc_w_var = startup2.global_block().var("fc_w") + self.assertEqual(fc_w_var.shape, (1000L, 1000L)) + # all parameters should be optimized on pserver + + pserver_params = [] + for prog in [pserver, pserver2]: + for blk in prog.blocks: + for op in blk.ops: + if "Param" in op.input_names: + param_name = op.input("Param")[0] + is_block_idx = param_name.find(".block") + if is_block_idx != -1: + origin_param_name = param_name[:is_block_idx] + else: + origin_param_name = param_name + pserver_params.append(origin_param_name) + trainer_params = [] + for op in self.origin_prog.global_block().ops: + if "Param" in op.input_names: + trainer_params.append(op.input("Param")[0]) + self.assertEqual(set(pserver_params), set(trainer_params)) + + class TestNoSliceVar(TranspilerTest): def setUp(self): super(TestNoSliceVar, self).setUp() - self.slice_var_up = False def test_transpiler(self): - _, startup = self.get_pserver(self.pserver1_ep) - _, startup2 = self.get_pserver(self.pserver2_ep) + config = fluid.DistributeTranspilerConfig() + config.slice_var_up = False + + _, startup = self.get_pserver(self.pserver1_ep, config) + _, startup2 = self.get_pserver(self.pserver2_ep, config) if startup.global_block().vars.has_key("fc_w"): fc_w_var = startup.global_block().vars["fc_w"] @@ -253,10 +304,50 @@ class TestL2Decay(TranspilerTest): # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer - # FIXME(typhoonzero): need to add test for async case: - # see https://github.com/PaddlePaddle/Paddle/issues/11691 -class TestAsyncSGD(TranspilerTest): - pass +class TestL2DecayWithPiecewise(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost 
= fluid.layers.mean(cost) + base_lr = 1.0 + bd = [1, 10, 20, 30] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + sgd_optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 9) + self.assertEqual([op.type for op in pserver.blocks[1].ops], [ + "increment", "cast", "fill_constant", "fill_constant", "less_than", + "logical_not", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "conditional_block" + ]) + self.assertEqual( + [op.type for op in pserver.blocks[7].ops], + ["sum", "scale", "scale", "elementwise_add", "momentum"]) + self.assertEqual( + [op.type for op in pserver.blocks[8].ops], + ["sum", "scale", "scale", "elementwise_add", "momentum"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index 92e718662dfd7998be3ede2994f160059679fa8a..31af1245720405ee067a0acf3575e3ae86372c13 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -65,10 +65,10 @@ class TestDyRnnStaticInput(unittest.TestCase): return self._lodtensor_to_ndarray(fetch_outs[0]) def _lodtensor_to_ndarray(self, lod_tensor): - dims = lod_tensor.get_dims() + dims = lod_tensor.shape() ndarray = 
np.zeros(shape=dims).astype('float32') for i in xrange(np.product(dims)): - ndarray.ravel()[i] = lod_tensor.get_float_element(i) + ndarray.ravel()[i] = lod_tensor._get_float_element(i) return ndarray, lod_tensor.recursive_sequence_lengths() def build_graph(self, only_forward=False): @@ -185,19 +185,19 @@ class TestDyRnnStaticInput(unittest.TestCase): actual_gradients, actual_lod = self.fetch_value(static_input_grad) - static_input_shape = self.static_input_tensor.get_dims() + static_input_shape = self.static_input_tensor.shape() numeric_gradients = np.zeros(shape=static_input_shape).astype('float32') # calculate numeric gradients tensor_size = np.product(static_input_shape) for i in xrange(tensor_size): - origin = self.static_input_tensor.get_float_element(i) + origin = self.static_input_tensor._get_float_element(i) x_pos = origin + self._delta - self.static_input_tensor.set_float_element(i, x_pos) + self.static_input_tensor._set_float_element(i, x_pos) y_pos = self.fetch_value(loss)[0][0] x_neg = origin - self._delta - self.static_input_tensor.set_float_element(i, x_neg) + self.static_input_tensor._set_float_element(i, x_neg) y_neg = self.fetch_value(loss)[0][0] - self.static_input_tensor.set_float_element(i, origin) + self.static_input_tensor._set_float_element(i, origin) numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2 self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001)) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6aa9d3bb656740c528c728efafc6a47e8bff91 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestFakeQuantizeOp(OpTest): + def setUp(self): + self.op_type = "fake_quantize" + self.attrs = { + 'bit_length': 8, + 'quantize_type': 'abs_max', + 'window_size': 10000 + } + self.inputs = { + 'X': np.random.random((10, 10)).astype("float32"), + 'InScales': np.zeros(self.attrs['window_size']).astype("float32"), + 'InCurrentIter': np.zeros(1).astype("float32"), + 'InMovingScale': np.zeros(1).astype("float32") + } + self.scale = { + 'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32") + } + self.outputs = { + 'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'OutScales': np.zeros(self.attrs['window_size']).astype("float32"), + 'OutMovingScale': + np.array([self.scale['abs_max']]).astype("float32"), + 'OutCurrentIter': np.zeros(1).astype("float32") + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_places_op.py b/python/paddle/fluid/tests/unittests/test_get_places_op.py index 6dab1e22f0c50ab011d6b8e8944097600cf3fecc..964423e2d2638224244b4ca774d8eee08f3ec989 100644 --- a/python/paddle/fluid/tests/unittests/test_get_places_op.py +++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import decorators import unittest @@ -20,7 +21,7 @@ import unittest class TestGetPlaces(unittest.TestCase): @decorators.prog_scope() def test_get_places(self): - places = fluid.layers.get_places() + places = get_places() cpu = fluid.CPUPlace() exe = fluid.Executor(cpu) exe.run(fluid.default_main_program()) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py new file mode 100644 index 0000000000000000000000000000000000000000..d090960c84e47da68a0ebea4609dfc3ed76e114e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -0,0 +1,99 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import math +from op_test import OpTest + + +def find_latest_set(num): + return 1 + int(math.floor(math.log(num, 2))) + + +class CodeTable(object): + def __init__(self, num_classes, code): + self.c = num_classes + code + + def cal_index(self, bit): + return (self.c >> (bit + 1)) - 1 + + def get_length(self): + return find_latest_set(self.c) - 1 + + def cal_bit(self, bit): + return self.c & (1 << bit) + + +def hsigmoid(x, w, label, bias, num_classes): + batch_size = x.shape[0] + code_length = find_latest_set(num_classes - 1) + code_table = [0 for _ in range(code_length)] + pre_output = np.zeros((batch_size, code_length)) + pre_sum = np.zeros((batch_size, 1)) + out = np.zeros((batch_size, 1)).astype("float32") + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += bias[0][idx] + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += np.dot(w[idx], x[i]) + # clip[-40.0, 40.0] + pre_output = np.clip(pre_output, -40.0, 40.0) + # out(i, 0) = \sum_j bit(i, j) * preout(i, j) + for i in range(batch_size): + code_table = CodeTable(num_classes, label[i]) + length = code_table.get_length() + sum = 0.0 + for j in range(length): + if code_table.cal_bit(j): + sum += pre_output[i][j] + out[i] = -1.0 * sum + # soft relu + pre_output = np.log(1 + np.exp(pre_output)) + pre_sum = pre_output.sum(1).reshape((batch_size, 1)) + out += pre_sum + return pre_output, out + + +class TestHSigmoidOp(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") + w = np.random.random((num_classes - 1, feature_size)).astype("float32") + label = 
np.random.randint(0, num_classes, (batch_size, 1)) + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes} + self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} + pre_output, out = hsigmoid(x, w, label, bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py index 4946475f11a4fc0ccaffeec6821d3976ea7c6560..13bc5768740ece00bbe285a0b47d82bb8a42d2c7 100644 --- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py +++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py @@ -16,23 +16,48 @@ import numpy as np from op_test import OpTest -def get_output_shape(attrs, in_shape): +def get_output_shape(attrs, in_shape, img_real_size): + batchsize = in_shape[0] img_height = in_shape[2] img_width = in_shape[3] + paddings = np.array(attrs['paddings']).astype("int32") + kernels = np.array(attrs['kernels']).astype("int32") + strides = np.array(attrs['strides']).astype("int32") + output_height = np.zeros((1, batchsize)).astype("int32") + output_width = np.zeros((1, batchsize)).astype("int32") + if len(img_real_size): + out_stride = np.array(attrs['out_stride']).astype("int32") + imgreal_h = 0 + imgreal_w = 0 + for index in range(batchsize): + if img_real_size[index, 0] % out_stride[0] == 0: + imgreal_h = img_real_size[index, 0] / out_stride[0] + else: + imgreal_h = img_real_size[index, 0] / out_stride[0] + 1 + if img_real_size[index, 0] % out_stride[1] == 0: + imgreal_w = img_real_size[index, 1] / out_stride[1] + else: + imgreal_w = img_real_size[index, 0] / out_stride[1] + 1 + output_height[0,index] = \ + 1 + \ + (imgreal_h + paddings[0] + paddings[2] - 
kernels[0] + strides[0] - 1) / \ + strides[0] - paddings = attrs['paddings'] - kernels = attrs['kernels'] - strides = attrs['strides'] + output_width[0,index] = \ + 1 + \ + (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] + else: + for index in range(batchsize): + output_height[0,index] = \ + 1 + \ + (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ + strides[0] - output_height = \ - 1 + \ - (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \ - strides[0] - - output_width = \ - 1 + \ - (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ - strides[1] + output_width[0,index] = \ + 1 + \ + (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \ + strides[1] return output_height, output_width @@ -75,22 +100,25 @@ def im2col(attrs, im, col): im_row_offset][im_col_offset] -def Im2Sequence(inputs, attrs): - output_height, output_width = get_output_shape(attrs, inputs.shape) +def Im2Sequence(inputs, img_real_size, attrs): + output_height, output_width = get_output_shape(attrs, inputs.shape, + img_real_size) img_channels = inputs.shape[1] batch_size = inputs.shape[0] - out = np.zeros([ - batch_size, output_height, output_width, img_channels, - attrs['kernels'][0], attrs['kernels'][1] - ]).astype("float32") - - for i in range(len(inputs)): - im2col(attrs, inputs[i], out[i]) - - out = out.reshape([ - batch_size * output_height * output_width, - img_channels * attrs['kernels'][0] * attrs['kernels'][1] - ]) + out = [] + for index in range(batch_size): + tmp = np.zeros([ + output_height[0, index], output_width[0, index], img_channels, + attrs['kernels'][0], attrs['kernels'][1] + ]).astype("float32") + out.append(tmp) + for index in range(len(inputs)): + im2col(attrs, inputs[index], out[index]) + out[index] = out[index].reshape([ + output_height[0, index] * output_width[0, index], + img_channels * attrs['kernels'][0] * attrs['kernels'][1] + ]) + out = 
np.concatenate(out, axis=0) return out @@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest): self.attrs = { 'kernels': [2, 2], 'strides': [1, 1], - 'paddings': [1, 1, 1, 1] + 'paddings': [1, 1, 1, 1], } def setUp(self): @@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest): self.batch_size, self.img_channels, self.img_height, self.img_width ]).astype("float32") - out = Im2Sequence(x, self.attrs) + real_size = np.array([]).astype("float32") + out = Im2Sequence(x, real_size, self.attrs) self.inputs = {'X': x} self.outputs = {'Out': out} @@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp): self.attrs = { 'kernels': [2, 1], 'strides': [2, 1], - 'paddings': [2, 1, 2, 1] + 'paddings': [2, 1, 2, 1], } class TestBlockExpandOpCase3(TestBlockExpandOp): def config(self): - self.batch_size = 3 + self.batch_size = 2 self.img_channels = 1 self.img_height = 4 self.img_width = 5 self.attrs = { 'kernels': [2, 1], 'strides': [2, 1], - 'paddings': [2, 0, 2, 0] + 'paddings': [2, 0, 2, 0], } @@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp): self.attrs = { 'kernels': [2, 2], 'strides': [1, 1], - 'paddings': [0, 0, 0, 0] + 'paddings': [0, 0, 0, 0], + } + + +class TestBlockExpandOpCase5(OpTest): + def config(self): + self.batch_size = 1 + self.img_channels = 3 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'kernels': [2, 1], + 'strides': [2, 1], + 'paddings': [2, 1, 2, 1], + 'out_stride': [2, 2], + } + + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[8, 10], [5, 8]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} #l ?? 
+ self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +class TestBlockExpandOpCase6(OpTest): + def config(self): + self.batch_size = 3 + self.img_channels = 1 + self.img_height = 4 + self.img_width = 5 + self.attrs = { + 'kernels': [2, 1], + 'strides': [1, 1], + 'paddings': [0, 0, 0, 0], + 'out_stride': [1, 1], + } + + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} #l ?? + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +class TestBlockExpandOpCase7(OpTest): + def config(self): + self.batch_size = 2 + self.img_channels = 2 + self.img_height = 3 + self.img_width = 3 + self.attrs = { + 'kernels': [2, 2], + 'strides': [1, 1], + 'paddings': [1, 0, 1, 0], + 'out_stride': [2, 2], } + def setUp(self): + self.config() + self.op_type = "im2sequence" + x = np.random.uniform(0.1, 1, [ + self.batch_size, self.img_channels, self.img_height, self.img_width + ]).astype("float32") + real_size = np.array([[6, 6], [4, 4]]).astype("float32") + out = np.array(Im2Sequence(x, real_size, self.attrs)) + self.inputs = {'X': x, 'Y': real_size} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + if __name__ == '__main__': unittest.main() +#set shiftwidth=4 set expandtab set tabstop=4 diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 842d34c07e94a79e3351347e2528ecc478cc56dc..6b1f206ea2f5a6226cfdb01c70a8ce4646ae4788 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import 
paddle.fluid.layers as layers +from paddle.fluid.layers.device import get_places import paddle.fluid.nets as nets from paddle.fluid.framework import Program, program_guard, default_main_program from paddle.fluid.param_attr import ParamAttr @@ -173,6 +174,16 @@ class TestBook(unittest.TestCase): x=dat, label=lbl)) print(str(program)) + def test_hsigmoid(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[2], dtype='float32') + y = layers.data(name='y', shape=[2], dtype='int64') + self.assertIsNotNone( + layers.hsigmoid( + input=x, label=y, num_classes=2)) + print(str(program)) + def test_sequence_expand(self): program = Program() with program_guard(program): @@ -238,7 +249,7 @@ class TestBook(unittest.TestCase): def test_get_places(self): program = Program() with program_guard(program): - x = layers.get_places(device_count=4) + x = get_places(device_count=4) self.assertIsNotNone(x) print(str(program)) @@ -251,12 +262,16 @@ class TestBook(unittest.TestCase): print(str(program)) def test_im2sequence(self): - print("test_im2sequence") program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 128, 128], dtype='float32') + y = layers.data(name='y', shape=[], dtype='float32') output = layers.im2sequence( - input=x, stride=[1, 1], filter_size=[2, 2]) + input=x, + input_image_size=y, + stride=[1, 1], + filter_size=[2, 2], + out_stride=[1, 1]) self.assertIsNotNone(output) print(str(program)) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 7286c7c450108c4b5ad7136041bc4e989894a2ba..18921d727f94a85b69259c07273f09c3e19390c6 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) 
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass( + opts = momentum_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) sgd_op = opts[-1] @@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass( + opts = momentum_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) sgd_op = opts[-1] @@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adagrad_optimizer._create_optimization_pass( + params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "adagrad"]) @@ -278,8 +278,8 @@ class TestAdamOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) - opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adam_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 5) self.assertEqual( [op.type for op in opts], @@ -287,7 +287,7 @@ class TestAdamOptimizer(unittest.TestCase): # Check accumulators accumulators = adam_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 2) + self.assertEqual(len(accumulators), 4) self.assertTrue(adam_optimizer.get_moment1_str() in accumulators) self.assertTrue(adam_optimizer.get_moment2_str() in accumulators) moment1_acc 
= accumulators[adam_optimizer.get_moment1_str()] @@ -345,8 +345,8 @@ class TestAdamaxOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 4) self.assertEqual( [op.type for op in opts], @@ -354,7 +354,7 @@ class TestAdamaxOptimizer(unittest.TestCase): # Check accumulators accumulators = adamax_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 2) + self.assertEqual(len(accumulators), 3) self.assertTrue(adamax_optimizer.get_moment_str() in accumulators) self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators) moment_acc = accumulators[adamax_optimizer.get_moment_str()] @@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) - opts = decayed_adagrad_optimizer.create_optimization_pass( + opts = decayed_adagrad_optimizer._create_optimization_pass( params_grads, mul_out, init_program) self.assertEqual(len(opts), 3) self.assertEqual( @@ -475,8 +475,8 @@ class TestFtrlOptimizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) - opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out, - init_program) + opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 3) self.assertEqual([op.type for op in opts], ["fill_constant", "elementwise_mul", "ftrl"]) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index a801d99aa1ced35eb7f081fde63ad541f0eb2589..f098dc7a3fb670e23471c2aa897011a3cf882c33 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -101,9 +101,7 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) @@ -115,20 +113,19 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_simple_fc(self): - self.check_simple_fc_convergence(False, use_cuda=True) - self.check_simple_fc_convergence(False, use_cuda=False) + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True, use_cuda=True) - self.check_simple_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_convergence(True, True) + self.check_simple_fc_convergence(False, True) - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -145,8 +142,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, 
- balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) for p_f in parallel_first_loss: self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) @@ -154,15 +150,15 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False, use_cuda=True) - self.check_simple_fc_parallel_accuracy(False, use_cuda=False) + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True, use_cuda=True) - self.check_simple_fc_parallel_accuracy(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_parallel_accuracy(True, True) + self.check_simple_fc_parallel_accuracy(False, True) - def check_batchnorm_fc_convergence( - self, balance_parameter_opt_between_cards, use_cuda): + def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -171,16 +167,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False, use_cuda=True) - self.check_batchnorm_fc_convergence(False, use_cuda=False) + self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True, use_cuda=True) - self.check_batchnorm_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_batchnorm_fc_convergence(True, True) + self.check_batchnorm_fc_convergence(False, True) 
if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 066299e6c6f7f6c159cb0886e86d3404b027b698..4d39505b66abf44249e0ea160b82aaf7be0638cb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -13,8 +13,12 @@ # limitations under the License. import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter from parallel_executor_test_base import TestParallelExecutorBase import unittest +import math import os @@ -131,30 +135,71 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True, - iter=20): + def check_resnet_convergence_with_learning_rate_decay(self, + use_cuda=True, + use_reduce=False, + iter=20): + os.environ['CPU_NUM'] = str(4) + def _cosine_decay(learning_rate, step_each_epoch, epochs=120): + """ + Applies cosine decay to the learning rate. 
+ lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr + + def _optimizer(learning_rate=0.01): + optimizer = fluid.optimizer.Momentum( + learning_rate=_cosine_decay( + learning_rate=learning_rate, step_each_epoch=2, epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + return optimizer + import functools + batch_size = 2 - self.check_network_convergence( + + single_first_loss, single_last_loss = self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), iter=iter, batch_size=batch_size, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_resnet(self): - self.check_resnet_convergence(False, use_cuda=True) - self.check_resnet_convergence(False, use_cuda=False, iter=5) + use_reduce=use_reduce, + optimizer=_optimizer, + use_parallel_executor=False) - def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True, use_cuda=True) - self.check_resnet_convergence(True, use_cuda=False, iter=5) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + functools.partial( + SE_ResNeXt50Small, batch_size=batch_size), + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=use_reduce, + optimizer=_optimizer) + + for p_f in parallel_first_loss: + self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) + for p_l in parallel_last_loss: + self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + + def test_seresnext_with_learning_rate_decay(self): + self.check_resnet_convergence_with_learning_rate_decay(True, False) + self.check_resnet_convergence_with_learning_rate_decay( + False, False, iter=5) + + def test_seresnext_with_new_strategy_with_learning_rate_decay(self): + 
self.check_resnet_convergence_with_learning_rate_decay(True, True) + self.check_resnet_convergence_with_learning_rate_decay( + False, True, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 9ba5f988f317a515b77c0b428da236626419a2c3..9ec05e02973138e3ec233ef07f98afd598ec86b1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -15,6 +15,7 @@ import unittest import paddle.fluid as fluid +from paddle.fluid.layers.device import get_places import paddle.fluid.profiler as profiler import numpy @@ -115,7 +116,7 @@ class BaseParallelForTest(unittest.TestCase): if use_parallel: thread_num = fluid.core.get_cuda_device_count( ) if use_gpu else 8 - places = fluid.layers.get_places(thread_num) + places = get_places(thread_num) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index bcbc02a2baa46b9ab583ecf3006bd3262e6038fd..e15554737b9f3fa36382dde15ded928271679538 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -32,6 +32,7 @@ class TestPriorBoxOp(OpTest): 'variances': self.variances, 'flip': self.flip, 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, 'step_w': self.step_w, 'step_h': self.step_h, 'offset': self.offset @@ -52,6 +53,9 @@ class TestPriorBoxOp(OpTest): max_sizes = [5, 10] self.max_sizes = np.array(max_sizes).astype('float32').tolist() + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + def init_test_params(self): self.layer_w = 32 self.layer_h = 32 @@ -71,6 +75,7 @@ class TestPriorBoxOp(OpTest): self.set_max_sizes() self.aspect_ratios = [2.0, 3.0] self.flip = 
True + self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array( self.aspect_ratios, dtype=np.float).flatten() @@ -78,7 +83,6 @@ class TestPriorBoxOp(OpTest): self.variances = np.array(self.variances, dtype=np.float).flatten() self.clip = True - self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) if len(self.max_sizes) > 0: self.num_priors += len(self.max_sizes) @@ -106,26 +110,60 @@ class TestPriorBoxOp(OpTest): idx = 0 for s in range(len(self.min_sizes)): min_size = self.min_sizes[s] - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 - out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, - (c_y - c_h) / self.image_h, - (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h] - idx += 1 - - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. 
out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h] idx += 1 + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 # clip the prior's coordidate such that it is within[0, 1] if self.clip: @@ -137,10 +175,15 @@ class TestPriorBoxOp(OpTest): self.out_var = out_var.astype('float32') -class TestPriorBoxOpWithMaxSize(TestPriorBoxOp): +class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): def set_max_sizes(self): self.max_sizes = [] +class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 3f9059fb5b31cd009c068ccddc9a8938adae5772..f75a79bfa42405747df9e6f4f4ab743014e303b9 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -181,13 +181,13 @@ class TestBlockDesc(unittest.TestCase): self.assertIsNotNone(block) op1 = block.append_op() op2 = block.append_op() - op0 = block.prepend_op() + op0 = block._prepend_op() all_ops = [] for idx in xrange(0, block.op_size()): all_ops.append(block.op(idx)) 
self.assertEqual(all_ops, [op0, op1, op2]) - def test_remove_op(self): + def test__remove_op(self): program = Program() program_desc = program.desc self.assertIsNotNone(program_desc) @@ -201,8 +201,8 @@ class TestBlockDesc(unittest.TestCase): op1.set_type("test") op2.set_type("test") - block.remove_op(1, 2) - program.sync_with_cpp() + block._remove_op(1, 2) + program._sync_with_cpp() all_ops = [] for idx in xrange(0, block.op_size()): diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py new file mode 100644 index 0000000000000000000000000000000000000000..05715464848d835684dd3cf0e99e5d4dd482e0b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py @@ -0,0 +1,99 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import numpy as np +from threading import Thread + + +def feed_data(feed_queue, inputs): + for in_data in inputs: + feed_queue.push(in_data) + + +class TestPyReader(unittest.TestCase): + def setUp(self): + self.capacity = 10 + self.batch_size_min = 10 + self.batch_size_max = 20 + self.shapes = [(-1, 3, 2, 1), (-1, 1)] + self.lod_levels = [0, 0] + self.dtypes = ['float32', 'int64'] + self.iterations = 20 + + def test_single_thread_main(self): + self.main(use_thread=False) + + def test_multiple_thread_main(self): + self.main(use_thread=True) + + def main(self, use_thread=False): + with fluid.program_guard(fluid.Program(), fluid.Program()): + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + executor = fluid.Executor(place) + + data_file, feed_queue = fluid.layers.py_reader( + capacity=self.capacity, + dtypes=self.dtypes, + lod_levels=self.lod_levels, + shapes=self.shapes) + + read_out_data = fluid.layers.read_file(data_file) + self.inputs = [] + + for i in range(self.iterations): + in_data = fluid.LoDTensorArray() + batch_size = np.random.random_integers(self.batch_size_min, + self.batch_size_max) + for shape, dtype in zip(self.shapes, self.dtypes): + next_data = np.random.uniform( + low=0, high=1000, + size=(batch_size, ) + shape[1:]).astype(dtype) + in_data.append(executor.as_lodtensor(next_data)) + + self.inputs.append(in_data) + + executor.run(fluid.default_startup_program()) + self.outputs = [] + if use_thread: + thread = Thread( + target=feed_data, args=(feed_queue, self.inputs)) + thread.start() + for in_data in self.inputs: + self.outputs.append( + executor.run(fetch_list=list(read_out_data))) + else: + for in_data in self.inputs: + feed_queue.push(in_data) + self.outputs.append( + executor.run(fetch_list=list(read_out_data))) + + feed_queue.close() + self.validate() + + def validate(self): + self.assertEqual(len(self.inputs), len(self.outputs)) + for in_data_list, 
out_data_list in zip(self.inputs, self.outputs): + self.assertEqual(len(in_data_list), len(out_data_list)) + in_data_list_np = [ + np.array(in_lod_tensor) for in_lod_tensor in in_data_list + ] + for in_data, out_data in zip(in_data_list_np, out_data_list): + self.assertTrue((in_data == out_data).all()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..9a5b69eea46e74deeba87aefae4afac84c7745f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -0,0 +1,224 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import threading +import multiprocessing +import os + + +def as_tensor(np_array_or_tensor, place=None): + if isinstance(np_array_or_tensor, fluid.LoDTensor): + return np_array_or_tensor + + if place is None: + place = fluid.CPUPlace() + + tensor = fluid.LoDTensor() + tensor.set(np_array_or_tensor, place) + return tensor + + +def as_numpy(tensor_or_numpy): + return tensor_or_numpy if isinstance( + tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy) + + +def feed_data(feed_queue, reader): + data_generator = reader() + while True: + data = next(data_generator, None) + if data is None or not feed_queue.push(data): + break + + +def simple_fc_net(in_size, + class_num, + hidden_sizes, + batch_size, + queue_capacity, + use_double_buffer=False): + reader, feed_queue = fluid.layers.py_reader( + capacity=queue_capacity, + shapes=[[-1, in_size], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + reader = fluid.layers.batch(reader, batch_size=batch_size) + if use_double_buffer: + reader = fluid.layers.double_buffer(reader) + + in_data, label = fluid.layers.read_file(reader) + + hidden = in_data + for hidden_size in hidden_sizes: + hidden = fluid.layers.fc( + hidden, + size=hidden_size, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict_label, label=label)) + + optimizer = fluid.optimizer.Adam() + optimizer.minimize(loss) + return in_data, label, loss, optimizer, feed_queue + + +class TestPyReaderUsingExecutor(unittest.TestCase): + def setUp(self): + self.in_size = 1000 + self.hidden_sizes = [50, 30, 20] + self.class_num = 10 + self.batch_size = 32 + self.iterations = 10 + self.queue_capacity = 50 + + def test(self): + for use_cuda in [False, True]: + for 
use_parallel_executor in [False, True]: + for use_double_buffer in [False, True]: + print('Test Parameters:'), + print({ + 'use_cuda': use_cuda, + 'use_parallel_executor': use_parallel_executor, + 'use_double_buffer': use_double_buffer + }) + self.main(use_cuda, use_parallel_executor, + use_double_buffer) + + def random_reader(self): + def reader(): + self.inputs = [] + cnt = 0 + while True: + tensors = fluid.LoDTensorArray() + in_data = np.random.uniform( + low=0, high=1, size=(1, self.in_size)).astype('float32') + tensors.append(as_tensor(in_data)) + label = np.random.random_integers( + low=0, high=self.class_num - 1, size=(1, 1)).astype('int64') + tensors.append(as_tensor(label)) + + if cnt < self.iterations * self.batch_size * self.batch_size_times: + if cnt % (self.batch_size * self.batch_size_times) == 0: + self.inputs.append([in_data, label]) + else: + self.inputs[-1][0] = np.concatenate( + (self.inputs[-1][0], in_data), axis=0) + self.inputs[-1][1] = np.concatenate( + (self.inputs[-1][1], label), axis=0) + elif not self.use_double_buffer: + break + + yield tensors + cnt += 1 + + yield None + + return reader + + def main(self, + use_cuda=True, + use_parallel_executor=False, + use_double_buffer=False): + assert not use_cuda or use_cuda and core.is_compiled_with_cuda() + + self.use_cuda = use_cuda + self.use_parallel_executor = use_parallel_executor + self.use_double_buffer = use_double_buffer + + startup_program = fluid.Program() + main_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + in_data, label, loss, optimizer, feed_queue = simple_fc_net( + in_size=self.in_size, + class_num=self.class_num, + hidden_sizes=self.hidden_sizes, + batch_size=self.batch_size, + queue_capacity=self.queue_capacity, + use_double_buffer=self.use_double_buffer) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + startup_exe = fluid.Executor(place) + startup_exe.run(startup_program) + + if use_parallel_executor: + main_exe = 
fluid.ParallelExecutor(use_cuda, loss_name=loss.name) + if use_cuda: + self.batch_size_times = core.get_cuda_device_count() + else: + self.batch_size_times = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + else: + main_exe = startup_exe + self.batch_size_times = 1 + + reader = self.random_reader() + thread = threading.Thread( + target=feed_data, args=(feed_queue, reader)) + thread.start() + + self.outputs = [] + for _ in range(self.iterations): + fetches = main_exe.run(fetch_list=[in_data.name, label.name]) + fetches = [as_numpy(fetch) for fetch in fetches] + self.outputs.append(fetches) + + feed_queue.close() + self.validate() + + def validate(self): + self.assertEqual(len(self.inputs), len(self.outputs)) + for batch_in, batch_out in zip(self.inputs, self.outputs): + self.assertEqual(len(batch_in), len(batch_out)) + if self.use_parallel_executor and not self.use_double_buffer: + self.validate_unordered_batch(batch_in, batch_out) + else: + for in_data, out_data in zip(batch_in, batch_out): + self.assertEqual(in_data.shape, out_data.shape) + if not self.use_parallel_executor: + self.assertTrue((in_data == out_data).all()) + + def validate_unordered_batch(self, batch_in, batch_out): + out_index_left_set = set(range(self.batch_size * self.batch_size_times)) + mapping_num = 0 + for i in range(self.batch_size * self.batch_size_times): + for j in out_index_left_set: + flag = True + for k in range(len(batch_in)): + in_data = batch_in[k][i] + out_data = batch_out[k][j] + if (in_data != out_data).any(): + flag = False + break + + if flag: + out_index_left_set.remove(j) + mapping_num += 1 + break + + self.assertEqual(mapping_num, self.batch_size * self.batch_size_times) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py new file mode 100644 index 0000000000000000000000000000000000000000..d35183647ea57e378f0fe201ef03001122cb329f --- 
class TestReaderReset(unittest.TestCase):
    """Reads a RecordIO file for several passes, resetting the reader after
    each EOF, and checks that every sample is seen at most once per pass."""

    def prepare_data(self):
        # Sample n is (a vector of ins_shape filled with n, the label n), so
        # the label alone determines the expected data values.
        def fake_data_generator():
            for n in range(self.total_ins_num):
                yield np.ones(self.ins_shape) * n, n

        # Build the feeder in throw-away programs so the data layers do not
        # leak into the default programs.
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            reader = paddle.batch(fake_data_generator, batch_size=1)
            feeder = fluid.DataFeeder(
                feed_list=[
                    fluid.layers.data(
                        name='data', shape=[3], dtype='float32'),
                    fluid.layers.data(
                        name='label', shape=[1], dtype='int64'),
                ],
                place=fluid.CPUPlace())
            fluid.recordio_writer.convert_reader_to_recordio_file(
                self.data_file_name, reader, feeder)

    def setUp(self):
        self.use_cuda = fluid.core.is_compiled_with_cuda()
        self.data_file_name = './reader_reset_test.recordio'
        self.ins_shape = [3]
        self.batch_size = 5
        self.total_ins_num = self.batch_size * 20
        self.test_pass_num = 100
        self.prepare_data()

    def main(self, with_double_buffer):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            data_reader_handle = fluid.layers.io.open_files(
                filenames=[self.data_file_name],
                shapes=[[-1] + self.ins_shape, [-1, 1]],
                lod_levels=[0, 0],
                dtypes=['float32', 'int64'],
                thread_num=1,
                pass_num=1)
            data_reader = fluid.layers.io.batch(data_reader_handle,
                                                self.batch_size)
            if with_double_buffer:
                data_reader = fluid.layers.double_buffer(data_reader)
            image, label = fluid.layers.read_file(data_reader)
            fetch_list = [image.name, label.name]

        place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)

        build_strategy = fluid.BuildStrategy()
        if with_double_buffer:
            build_strategy.enable_data_balance = True
        exec_strategy = fluid.ExecutionStrategy()
        parallel_exe = fluid.ParallelExecutor(
            use_cuda=self.use_cuda,
            main_program=main_prog,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

        data_appeared = [False] * self.total_ins_num
        pass_count = 0
        while True:
            try:
                data_val, label_val = parallel_exe.run(fetch_list,
                                                       return_numpy=True)
                ins_num = data_val.shape[0]
                broadcasted_label = np.ones((ins_num, ) + tuple(
                    self.ins_shape)) * label_val.reshape((ins_num, 1))
                # Compare the arrays themselves. The previous
                # `assertEqual(a.all(), b.all())` only compared two booleans
                # and could not detect mismatching data.
                self.assertTrue(np.array_equal(data_val, broadcasted_label))
                for l in label_val:
                    # Every sample may show up at most once per pass.
                    self.assertFalse(data_appeared[l[0]])
                    data_appeared[l[0]] = True

            except fluid.core.EOFException:
                pass_count += 1
                if with_double_buffer:
                    # With double buffering the trailing
                    # device_count * batch_size samples are excluded from the
                    # "every sample was seen" check.
                    data_appeared = data_appeared[:-parallel_exe.device_count *
                                                  self.batch_size]
                for i in data_appeared:
                    self.assertTrue(i)
                if pass_count < self.test_pass_num:
                    data_appeared = [False] * self.total_ins_num
                    data_reader_handle.reset()
                else:
                    break

    def test_all(self):
        self.main(with_double_buffer=False)
        self.main(with_double_buffer=True)
def sum_lodtensor(self, tensor):
    """Return the sum of all elements of ``tensor`` as a Python float.

    ``tensor`` must provide ``shape()`` (its dimensions) and
    ``_get_float_element(i)`` (the i-th element in flat order).
    """
    # `range` and `np.prod` work on both Python 2 and 3 and on current NumPy;
    # `xrange` and the `np.product` alias do not.
    element_count = int(np.prod(tensor.shape()))
    sum_res = 0.0
    for i in range(element_count):
        sum_res += tensor._get_float_element(i)
    return sum_res
class TestSqueezeOp(OpTest):
    """General case: squeeze the listed size-1 axes (not in place)."""

    def setUp(self):
        self.op_type = "squeeze"
        self.init_test_case()
        x = np.random.random(self.ori_shape).astype("float32")
        self.inputs = {"X": x}
        self.init_attrs()
        # The expected output is the same data viewed with the squeezed shape.
        self.outputs = {"Out": x.reshape(self.new_shape)}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = (0, 2)
        self.new_shape = (3, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=False)


class TestSqueezeOp1(TestSqueezeOp):
    """One of the axes is given as a negative index."""

    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = (0, -2)
        self.new_shape = (3, 5)


class TestSqueezeOp2(TestSqueezeOp):
    """No axes given: every size-1 dimension is expected to be removed."""

    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = ()
        self.new_shape = (3, 5)


class TestSqueezeOp3(TestSqueezeOp):
    """Only some of the size-1 dimensions are squeezed."""

    def init_test_case(self):
        self.ori_shape = (3, 1, 5, 1, 4, 1)
        self.axes = (1, -1)
        self.new_shape = (3, 5, 1, 4)


class TestSqueezeOpInplace1(TestSqueezeOp):
    """General case with the `inplace` attribute enabled."""

    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = (0, 2)
        self.new_shape = (3, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=True)


# Correct: Inplace. There is minus axis.
class TestSqueezeOpInplace2(TestSqueezeOp):
    """In-place squeeze where one axis is given as a negative index."""

    # Was misspelled `inti_test_case`, so setUp() never called it and this
    # class silently re-ran the base-class configuration with axes (0, 2).
    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = (0, -2)
        self.new_shape = (3, 5)

    def init_attrs(self):
        self.attrs = {"axes": self.axes, "inplace": True}


class TestSqueezeOpInplace3(TestSqueezeOp):
    """In-place squeeze with no axes given."""

    def init_test_case(self):
        self.ori_shape = (1, 3, 1, 5)
        self.axes = ()
        self.new_shape = (3, 5)

    def init_attrs(self):
        self.attrs = {"axes": self.axes, "inplace": True}


class TestSqueezeOpInplace4(TestSqueezeOp):
    """In-place squeeze of only some of the size-1 dimensions."""

    def init_test_case(self):
        self.ori_shape = (3, 1, 5, 1, 4, 1)
        self.axes = (1, -1)
        self.new_shape = (3, 5, 1, 4)

    def init_attrs(self):
        self.attrs = {"axes": self.axes, "inplace": True}
lod_tensor._alloc_int(place) array = numpy.array(lod_tensor) array[0, 0, 0] = 3 array[3, 3, 5] = 10 @@ -84,8 +84,8 @@ class TestTensor(unittest.TestCase): var_lod = scope.var("test_lod_tensor") lod_tensor = var_lod.get_tensor() - lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.alloc_float(place) + lod_tensor._set_dims([5, 2, 3, 4]) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) self.assertEqual((5, 2, 3, 4), tensor_array.shape) @@ -104,14 +104,13 @@ class TestTensor(unittest.TestCase): self.assertListEqual(lod_py, lod) def test_lod_tensor_init(self): - scope = core.Scope() place = core.CPUPlace() lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() - lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor.set_recursive_sequence_lengths(lod_py) - lod_tensor.alloc_float(place) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 @@ -129,9 +128,9 @@ class TestTensor(unittest.TestCase): lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() - lod_tensor.set_dims([5, 2, 3, 4]) + lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor.set_recursive_sequence_lengths(lod_py) - lod_tensor.alloc_float(place) + lod_tensor._alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 @@ -149,15 +148,15 @@ class TestTensor(unittest.TestCase): tensor = var.get_tensor() - tensor.set_dims([0, 1]) - tensor.alloc_float(place) + tensor._set_dims([0, 1]) + tensor._alloc_float(place) tensor_array = numpy.array(tensor) self.assertEqual((0, 1), tensor_array.shape) if core.is_compiled_with_cuda(): gpu_place = core.CUDAPlace(0) - tensor.alloc_float(gpu_place) + tensor._alloc_float(gpu_place) tensor_array = numpy.array(tensor) self.assertEqual((0, 1), tensor_array.shape) diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py new 
class TestUnsqueezeOp(OpTest):
    """General case: insert size-1 dimensions at the listed axes."""

    def setUp(self):
        self.init_test_case()
        self.op_type = "unsqueeze"
        x = np.random.random(self.ori_shape).astype("float32")
        self.inputs = {"X": x}
        self.init_attrs()
        # The expected output is the same data viewed with the expanded shape.
        self.outputs = {"Out": x.reshape(self.new_shape)}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

    def init_test_case(self):
        self.ori_shape = (3, 5)
        self.axes = (1, 2)
        self.new_shape = (3, 1, 1, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=False)


class TestUnsqueezeOp1(TestUnsqueezeOp):
    """A single, negative axis."""

    def init_test_case(self):
        self.ori_shape = (3, 5)
        self.axes = (-1, )
        self.new_shape = (3, 5, 1)


class TestUnsqueezeOp2(TestUnsqueezeOp):
    """A mix of positive and negative axes."""

    def init_test_case(self):
        self.ori_shape = (3, 5)
        self.axes = (0, -1)
        self.new_shape = (1, 3, 5, 1)


# Correct: There is duplicated axis.
class TestUnsqueezeOp3(TestUnsqueezeOp):
    """The same axis appears twice in `axes`."""

    def init_test_case(self):
        self.ori_shape = (3, 2, 5)
        self.axes = (0, 3, 3)
        self.new_shape = (1, 3, 2, 1, 1, 5)


class TestUnsqueezeOp4(TestUnsqueezeOp):
    """Axes are listed in non-increasing order."""

    def init_test_case(self):
        self.ori_shape = (3, 2, 5)
        self.axes = (3, 1, 1)
        self.new_shape = (3, 1, 1, 2, 5, 1)


class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
    """General case with the `inplace` attribute enabled."""

    def init_test_case(self):
        self.ori_shape = (3, 5)
        self.axes = (0, 2)
        self.new_shape = (1, 3, 1, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=True)


class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
    """In-place unsqueeze where one axis is given as a negative index."""

    def init_test_case(self):
        self.ori_shape = (3, 5)
        self.axes = (0, -2)
        self.new_shape = (1, 3, 1, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=True)


class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
    """In-place unsqueeze with a duplicated axis."""

    def init_test_case(self):
        self.ori_shape = (3, 2, 5)
        self.axes = (0, 3, 3)
        self.new_shape = (1, 3, 2, 1, 1, 5)

    def init_attrs(self):
        self.attrs = dict(axes=self.axes, inplace=True)
b6e0241265b18377874efb0d223441994b4650d0..64049a93cb0a267722de9cd94961b6256551330d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -14,6 +14,9 @@ import contextlib import os +import errno +import shutil +import time import core @@ -94,7 +97,7 @@ class EndStepEvent(object): class CheckpointConfig(object): """ - Parameter object for :code:`fluid.io.save_checkpoint` and + Parameter object for :code:`save_checkpoint` and :code:`fluid.Trainer`. Used to configuration how to save checkpoint. Args: @@ -237,7 +240,7 @@ class Trainer(object): self.checkpoint_cfg = checkpoint_config if self.checkpoint_cfg: assert isinstance(self.checkpoint_cfg, CheckpointConfig) - serial = io.get_latest_checkpoint_serial( + serial = _get_latest_checkpoint_serial( self.checkpoint_cfg.checkpoint_dir) self.checkpoint_cfg.load_serial = serial if serial >= 0 else None @@ -276,32 +279,15 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: - with self._prog_and_scope_guard(): - exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, - self.startup_program) - - if not self.checkpoint_cfg.pserver_id: - epoch_id, step_id = io.load_trainer_args( - self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, self.trainer_id, - self._get_checkpoint_load_args()) - self.checkpoint_cfg.epoch_id = int(epoch_id) - self.checkpoint_cfg.step_id = int(step_id) - else: - if self.checkpoint_cfg.lookup_table_name: - io.load_lookup_table_vars( - exe, self.checkpoint_cfg.checkpoint_dir, - self.startup_program, - self.checkpoint_cfg.pserver_id, - self.checkpoint_cfg.lookup_table_name) + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: + self._load_checkpoint() if param_path and os.path.isdir(param_path): # load params from param_path into scope - io.load_persist_vars_without_grad( - 
exe, dirname=param_path, program=self.startup_program) + io.load_persistables( + executor=exe, + dirname=param_path, + main_program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -549,7 +535,7 @@ class Trainer(object): def _clean_checkpoint(self): assert self.checkpoint_cfg - io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) + clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) def _get_checkpoint_load_args(self): """ @@ -572,7 +558,7 @@ class Trainer(object): if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) - io.save_checkpoint( + save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, @@ -580,6 +566,41 @@ class Trainer(object): main_program=self.train_program, max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) + def _load_checkpoint(self): + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program) + + if not self.checkpoint_cfg.pserver_id: + load_trainer_args = self._get_checkpoint_load_args() + trainer_args = load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.trainer_id, + is_trainer=True, + load_trainer_args=load_trainer_args) + + if len(trainer_args) != 2: + raise ValueError( + "the return trainer_args length do not equal _get_checkpoint_load_args" + ) + self.checkpoint_cfg.epoch_id = int(trainer_args[0]) + self.checkpoint_cfg.step_id = int(trainer_args[1]) + else: + if self.checkpoint_cfg.lookup_table_name: + load_checkpoint( + executor=exe, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, + main_program=self.startup_program, + role_id=self.checkpoint_cfg.pserver_id, + is_trainer=False, + 
load_trainer_args=None, + load_lookup_table=self.checkpoint_cfg.lookup_table_name) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): @@ -602,3 +623,610 @@ def build_feed_var_list(program, feed_order): program.global_block().var(pair[0]) for pair in sorted_pair_list ] return feed_var_list + + +# move Checkpoint APIs from io.py to trainer.py, make all of them are private. +SUCCESS_MARK_FILENAME = "_SUCCESS" +CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" +TRAINER_PREFIX = "trainer" +CHECKPOINT_SEPARATOR = "_" + + +def save_checkpoint(executor, + checkpoint_dir, + trainer_id, + main_program, + trainer_args=None, + max_num_checkpoints=3, + lookup_table=None, + pserver_endpoints=None): + """ + This function filters out all checkpoint variables from the give + main_program and then saves these variables to the `checkpoint_dir` + directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there might be a lot of checkpoints in the + `checkpoint_dir`. To avoid them taking too much disk space, the + `max_num_checkpoints` are introduced to limit the total number of + checkpoints. If the number of existing checkpints is greater than + the `max_num_checkpoints`, oldest ones will be scroll deleted. + + A variable is a checkpoint variable and will be saved if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for save checkpoint. + checkpoint_dir(str): The folder where to save checkpoints. + trainer_id(int): currect trainer id, if id is equal to 0, the trainer + is chief. + trainer_args(dict|None): Current training arguments. Such as 'epoch_id' + and 'step_id'. + Defaut: None + main_program(Program): The program whose checkpoint variables will + be saved. 
+ max_num_checkpoints(int): The max number of total number of existing + checkpoints. + Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + pserver_endpoints(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get pserver_endpoints by + distribute arguments. + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + AssertionError: If `trainer_args` is not a dict. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3, + lookup_table=table_name, + pserver_endpoints = ps_endpoints) + """ + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + if main_program is None: + raise ValueError('main_program should not be None.') + + if trainer_args: + assert isinstance(trainer_args, dict) + + is_chief = trainer_id == 0 + + _make_chekcpoint_dirs(checkpoint_dir) + serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) + + _save_trainer_args(cur_dir, trainer_id, trainer_args) + + if is_chief: + _save_persist_vars_without_grad(executor, cur_dir, main_program) + + if is_chief and lookup_table and pserver_endpoints: + _save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + pserver_endpoints) + + _scroll_delete(checkpoint_dir, max_num_checkpoints) + + +def load_checkpoint(executor, + checkpoint_dir, + main_program, + role_id=0, + is_trainer=True, + load_trainer_args=None, + load_lookup_table=None): + """ + This function filters out all 
checkpoint variables from the give + main_program and then try to load these variables from the + `checkpoint_dir` directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there are more than one checkpoint in the + `checkpoint_dir` (each checkpoint has its own sub folder), use + `serial` to specify which serial of checkpoint you would like to + load. + + A variable is a checkpoint variable and will be loaded if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading checkpoint. + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + main_program(Program): The program whose checkpoint variables will + be loaded. + role_id(int): the trainer id or the parameter server id. + is_trainer(bool): trainer is True and parameter server is False. + load_trainer_args(list|None): list about load trainer args. + load_lookup_table(str|None): the lookup table name + + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + ValueError: If `main_program` is None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + + # In this example, `load_checkpoint` function + # will first filters out all checkpoint variables in the default + # main program, and then try to load these variables form the + # folder "./checkpoints/checkpoint_9/__model__". 
+ """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + serial = _get_latest_checkpoint_serial(checkpoint_dir) + + # there are nothing need to be loaded + if serial is None or serial < 0: + return + + if main_program is None: + raise ValueError('main_program should not be None.') + + if is_trainer and load_trainer_args is None: + cur_dir = _get_serial_dir(checkpoint_dir, serial) + _load_persist_vars_without_grad(executor, cur_dir, main_program, True) + return + + if is_trainer and load_trainer_args: + return _load_trainer_args(checkpoint_dir, serial, role_id, + load_trainer_args) + + if not is_trainer and load_lookup_table: + _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, + load_lookup_table) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + + : param checkpoint_dir + : param delete_dir + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def _load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. 
+ program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `_load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". + """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + _load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) + """ + + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break + + assert lookup_table_var is not None + + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) + + load_prog = framework.Program() + load_block = load_prog.global_block() + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + +def _save_persist_vars_without_grad(executor, dirname, program): + """ + This function filters out all checkpoint variables from the give + program and then save these variables to a sub-folder '__model__' of + the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be saved. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _save_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog) + + # In this example, `_save_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then saves these variables to the folder + # "./my_paddle_model/__model__". 
+ """ + cur_dir = _get_model_dir(dirname) + io.save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def _save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) + """ + cur_dir = _get_lookuptable_dir(dirname) + + checkpoint_notify_program = framework.Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table + + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + +def _save_trainer_args(dirname, trainer_id, trainer_args): + assert isinstance(trainer_args, dict) + + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in trainer_args.iteritems(): + args_file = os.path.join(cur_dir, name) + with 
open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + _load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ + assert isinstance(trainer_args, list) + + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values + + +def _is_checkpoint_var(var): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + : param var(Variable) + """ + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False + + # .block is named for distribute train variables, checkpoint will not save it. 
+ if ".block" in var.name: + return False + + return var.persistable + + +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. + """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + _make_chekcpoint_dirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir + + +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + _make_chekcpoint_dirs(trainer_dir) + return trainer_dir + + +def _scroll_delete(dirname, max_num_checkpoints=3): + dirs = os.listdir(dirname) + serial_map = {} + for serial in dirs: + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial + + if len(serial_map.keys()) <= max_num_checkpoints: + return + + serials = serial_map.keys() + serials.sort(reverse=True) + serials = serials[max_num_checkpoints:] + for serial in serials: + cur_dir = _get_serial_dir(dirname, serial) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err + + +def _write_success(dirname): + 
""" + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. + + : param dirname + """ + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) + with open(success_file, 'a') as f: + now = time.ctime() + f.write(now) + + +def _get_latest_checkpoint_serial(checkpoint_dir): + """ + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + : param checkpoint_dir + """ + if not checkpoint_dir: + return -1 + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) + if os.path.isfile(success_path): + return serial + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index cf18090f71f34be5105498f5846dbcdf15ab2e3f..eae13b50398f791d4a203b72a0e96f3e87cc2a88 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from distribute_transpiler import DistributeTranspiler +from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig from inference_transpiler import InferenceTranspiler from memory_optimization_transpiler import memory_optimize, release_memory from ps_dispatcher import HashName, RoundRobin __all__ = [ "DistributeTranspiler", "InferenceTranspiler", "memory_optimize", - "release_memory", "HashName", "RoundRobin" + "release_memory", "HashName", "RoundRobin", "DistributeTranspilerConfig" ] diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index f10b496306a002ee131d01798a0698b807d379ca..2ca1d4716b103d17117ae3ee958667c3a9747cdf 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -17,10 +17,10 @@ def delete_ops(block, ops): try: start = list(block.ops).index(ops[0]) end = list(block.ops).index(ops[-1]) - [block.remove_op(start) for _ in xrange(end - start + 1)] + [block._remove_op(start) for _ in xrange(end - start + 1)] except Exception, e: raise e - block.program.sync_with_cpp() + block.program._sync_with_cpp() def find_op_by_input_arg(block, arg_name): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 53d6ca86a008f798af2854a154cce8b7242d2f35..c2044bf03135dd9c5256021d87866cfbbc598dad 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -31,6 +31,7 @@ Steps to transpile pserver: from __future__ import print_function import math +import random import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher @@ -63,7 +64,7 @@ def same_or_split_var(p_name, var_name): return p_name == var_name or p_name.startswith(var_name + ".block") -def slice_variable(var_list, slice_count, min_block_size=8192): +def 
slice_variable(var_list, slice_count, min_block_size): """ We may need to split dense tensor to one or more blocks and put them equally onto parameter server. One block is a sub-tensor @@ -109,6 +110,22 @@ def slice_variable(var_list, slice_count, min_block_size=8192): return blocks +class DistributeTranspilerConfig(object): + """ + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + min_block_size (int): Minimum splitted element number in block. + According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + We can use bandwidth effiently when data size is larger than 2MB.If you + want to change it, please be sure you see the slice_variable function. + """ + + slice_var_up = True + split_method = None + min_block_size = 8192 + + class DistributeTranspiler(object): """ **DistributeTranspiler** @@ -145,13 +162,23 @@ class DistributeTranspiler(object): trainer_program = t.get_trainer_program() """ + def __init__(self, config=None): + if config is not None: + self.config = config + else: + self.config = DistributeTranspilerConfig() + + if self.config.split_method is None: + self.config.split_method = RoundRobin + + assert (self.config.min_block_size >= 8192) + assert (self.config.split_method.__bases__[0] == PSDispatcher) + def transpile(self, trainer_id, program=None, pservers="127.0.0.1:6174", trainers=1, - slice_var_up=True, - split_method=RoundRobin, sync_mode=True): """ Run the transpiler. @@ -164,12 +191,8 @@ class DistributeTranspiler(object): pservers (str): comma separated ip:port string for the pserver list. trainers (int): number of trainers in the distributed job. - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. 
sync_mode (bool): Do sync training or not, default is True. """ - assert (split_method.__bases__[0] == PSDispatcher) if program is None: program = default_main_program() self.origin_program = program @@ -180,11 +203,11 @@ class DistributeTranspiler(object): self.pserver_endpoints = pserver_endpoints self.optimize_ops, self.params_grads = self._get_optimize_pass() - ps_dispatcher = split_method(self.pserver_endpoints) + ps_dispatcher = self.config.split_method(self.pserver_endpoints) self.has_distributed_lookup_table = self._has_distributed_lookup_table() # split and create vars, then put splited vars in dicts for later use. - self._init_splited_vars(slice_var_up) + self._init_splited_vars() # step 3.1: insert send op to send gradient vars to parameter servers ps_dispatcher.reset() @@ -196,13 +219,14 @@ class DistributeTranspiler(object): # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 # shuffle the map will avoid the uneven distribution above grad_var_mapping_items = self.grad_var_mapping.items() - if not slice_var_up: - np.random.shuffle(grad_var_mapping_items) + if not self.config.slice_var_up: + random.seed(self.trainer_num) + random.shuffle(grad_var_mapping_items) for orig_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) - if not slice_var_up: + if not self.config.slice_var_up: assert (len(splited_vars) == 1) if len(splited_vars) == 1: @@ -219,7 +243,7 @@ class DistributeTranspiler(object): AssertionError("Can not insert the send op by original " "variable name :", orig_varname) - program.global_block().insert_op( + program.global_block()._insert_op( index=index + 1, type="send", inputs={"X": splited_vars}, @@ -377,11 +401,6 @@ class DistributeTranspiler(object): # append it into the sub program. global_ops = [] - # HACK: optimization global ops only used to scale beta1 and beta2 - # replace it with dependency engine. 
- for op in self.optimize_ops: - if self._is_adam_connected_op(op): - global_ops.append(op) def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops): @@ -410,7 +429,7 @@ class DistributeTranspiler(object): # clone vars for var in origin_block.vars: - new_sub_block.clone_variable(var) + new_sub_block._clone_variable(var) # clone ops for origin_op in origin_block.ops: @@ -442,6 +461,8 @@ class DistributeTranspiler(object): per_opt_block = pserver_program.create_block(pre_block_idx) optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay + # cases may like: + # L2Decay op -> clip op -> optimize for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping grad_varname_for_block = __op_have_grad_input__(op) @@ -449,6 +470,7 @@ class DistributeTranspiler(object): merged_var = self._append_pserver_grad_merge_ops( per_opt_block, grad_varname_for_block, endpoint, grad_to_block_id, self.origin_program) + break # append optimize op once then append other ops. for _, op in enumerate(self.optimize_ops): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: @@ -503,7 +525,7 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - pserver_program.sync_with_cpp() + pserver_program._sync_with_cpp() return pserver_program def get_startup_program(self, endpoint, pserver_program): @@ -535,7 +557,7 @@ class DistributeTranspiler(object): pserver_vars = pserver_program.global_block().vars created_var_map = dict() for _, var in pserver_vars.iteritems(): - tmpvar = s_prog.global_block().clone_variable(var) + tmpvar = s_prog.global_block()._clone_variable(var) created_var_map[var.name] = tmpvar # 2. rename op outputs @@ -630,7 +652,7 @@ class DistributeTranspiler(object): ] return param_list, grad_list - def _init_splited_vars(self, slice_var_up): + def _init_splited_vars(self): # update these mappings for further transpile: # 1. 
param_var_mapping: param var name -> [splited params vars] # 2. grad_var_mapping: grad var name -> [splited grads vars] @@ -654,17 +676,22 @@ class DistributeTranspiler(object): param_list, grad_list = self._update_dist_lookup_table_vars( param_list, grad_list, self.params_grads) - if slice_var_up: + if self.config.slice_var_up: # when we slice var up into blocks, we will slice the var according to # pserver services' count. A pserver may have two or more listening ports. - grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + grad_blocks = slice_variable(grad_list, + len(self.pserver_endpoints), + self.config.min_block_size) param_blocks = slice_variable(param_list, - len(self.pserver_endpoints)) + len(self.pserver_endpoints), + self.config.min_block_size) else: # when we do NOT slice var up into blocks, we will always slice params # grads into one block. - grad_blocks = slice_variable(grad_list, 1) - param_blocks = slice_variable(param_list, 1) + grad_blocks = slice_variable(grad_list, 1, + self.config.min_block_size) + param_blocks = slice_variable(param_list, 1, + self.config.min_block_size) assert (len(grad_blocks) == len(param_blocks)) # origin_varname -> [splited_var] @@ -733,7 +760,7 @@ class DistributeTranspiler(object): self.all_prefetch_output_vars.append(prefetch_output_vars) # insert split_ids_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index, type="split_ids", inputs={ @@ -745,7 +772,7 @@ class DistributeTranspiler(object): outputs={"Out": prefetch_input_vars}) # insert prefetch_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index + 1, type="prefetch", inputs={'X': prefetch_input_vars}, @@ -756,7 +783,7 @@ class DistributeTranspiler(object): }) # insert concat_op - program.global_block().insert_op( + program.global_block()._insert_op( index=lookup_table_op_index + 2, type="merge_ids", inputs={ @@ -787,14 +814,14 @@ class 
DistributeTranspiler(object): if table_grad_name in op.output_arg_names: op_index = list(all_ops).index(op) # insert split_ids_op - program.global_block().insert_op( + program.global_block()._insert_op( index=op_index + 1, type="split_ids", inputs={ 'Ids': [program.global_block().vars[table_grad_name]] }, outputs={"Out": self.trainer_side_table_grad_list}) - program.global_block().insert_op( + program.global_block()._insert_op( index=op_index + 2, type="send", inputs={'X': self.trainer_side_table_grad_list}, @@ -853,7 +880,7 @@ class DistributeTranspiler(object): persistable=True) # parameter must be selected rows param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - grad_var = pserver_program.global_block().clone_variable( + grad_var = pserver_program.global_block()._clone_variable( self.origin_program.global_block().vars[grad_var_name( self.table_name)]) @@ -893,7 +920,7 @@ class DistributeTranspiler(object): if not splited_grad_name.startswith(origin_grad_name): raise ValueError("origin_grad_var: " + splited_grad_name + " grad_var:" + grad_var.name) - grad_var = pserver_program.global_block().rename_var( + grad_var = pserver_program.global_block()._rename_var( origin_grad_name, splited_grad_name) lr_var = pserver_program.global_block().vars[table_opt_op.input( @@ -969,7 +996,7 @@ class DistributeTranspiler(object): if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ (orig_var.name, self.trainer_id) - program.global_block().rename_var(varname, new_var_name) + program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ [program.global_block().var(new_var_name)] else: @@ -1003,7 +1030,7 @@ class DistributeTranspiler(object): type=orig_var.type, shape=splited_shape) # flattend splited var var_mapping[varname].append(var) - program.global_block().sync_with_cpp() + program.global_block()._sync_with_cpp() return var_mapping def create_splited_vars(self, source_var, block, tag): @@ -1031,7 +1058,7 @@ class 
DistributeTranspiler(object): height_sections = [] for v in splited_vars: height_sections.append(v.shape[0]) - program.global_block().insert_op( + program.global_block()._insert_op( index=index + 1, type="split_selected_rows", inputs={"X": orig_var}, @@ -1041,7 +1068,7 @@ class DistributeTranspiler(object): sections = [] for v in splited_vars: sections.append(v.shape[0]) - program.global_block().insert_op( + program.global_block()._insert_op( index=index + 1, type="split_byref", inputs={"X": orig_var}, @@ -1230,7 +1257,7 @@ class DistributeTranspiler(object): varlist = [varlist] for var in varlist: if var not in program.global_block().vars: - block.clone_variable(var) + block._clone_variable(var) outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, op) @@ -1239,7 +1266,7 @@ class DistributeTranspiler(object): varlist = [varlist] for var in varlist: if var not in program.global_block().vars: - block.clone_variable(var) + block._clone_variable(var) return block.append_op( type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) @@ -1277,7 +1304,7 @@ class DistributeTranspiler(object): if grad_block: outputs[key] = grad_block elif not program.global_block().vars.has_key(var.name): - program.global_block().clone_variable(var) + program.global_block()._clone_variable(var) return optimize_block.append_op( type=opt_op.type, @@ -1289,26 +1316,8 @@ class DistributeTranspiler(object): # If one op's input is another op's output or # one op's output is another op's input, we say # the two operator is connected. - def _append_inname_remove_beta(varname_list): - op_input_names = [] - for in_name in varname_list: - # HACK: remove beta1 and beta2 to avoid let all - # ops connected. 
- if in_name.startswith("beta2_pow_acc") or \ - in_name.startswith("beta1_pow_acc"): - continue - else: - op_input_names.append(in_name) - return op_input_names - - op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names()) - op1_output_names = op1.desc.output_arg_names() - - op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names()) - op2_output_names = op2.desc.output_arg_names() - - if set(op1_output_names) & set(op2_input_names) or \ - set(op1_input_names) & set(op2_output_names): + if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \ + set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()): return True return False @@ -1413,7 +1422,7 @@ class DistributeTranspiler(object): def _get_optimize_pass(self): """ - Get optimizer operators, paramters and gradients from origin_program + Get optimizer operators, parameters and gradients from origin_program Returns: opt_ops (list): optimize operators. params_grads (dict): paramter->gradient. @@ -1436,20 +1445,6 @@ class DistributeTranspiler(object): origin_var_dict[param_name], origin_var_dict[input_name] ]) - elif self._is_adam_connected_op(op): - opt_ops.append(op) else: pass return opt_ops, params_grads - - def _is_adam_connected_op(self, op): - """ - A hack function to determinate whether the input operator - is connected to optimize operator. 
- """ - if op.type == "scale": - for in_name in op.input_arg_names: - if in_name.startswith("beta1_pow_acc") or \ - in_name.startswith("beta2_pow_acc"): - return True - return False diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index b8afeae5ebd6ef7948a7c0c2775f419af461da04..f1905f08787da7a58a41d840ea68fb6c07f4028f 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -95,7 +95,7 @@ class InferenceTranspiler(object): # modify bnorm OP to include relu current_op.set_attr("fuse_with_relu", True) # remove relu OP - self.block.remove_op(i + 1) + self.block._remove_op(i + 1) i = i + 1 self._remove_unused_var() @@ -171,7 +171,7 @@ class InferenceTranspiler(object): # fuse batch_norm self._fuse_param(current_op, next_op, bias_op, 0) # remove batch_norm_op - self.block.remove_op(i + 2) + self.block._remove_op(i + 2) i = i + 1 # conv2d with bias, the next_op.type is elementwise_add elif (next_op.type == 'elementwise_add'): @@ -180,7 +180,7 @@ class InferenceTranspiler(object): # fuse batch_norm self._fuse_param(current_op, next_next_op, next_op, 1) # remove batch_norm_op - self.block.remove_op(i + 2) + self.block._remove_op(i + 2) i = i + 1 i = i + 1 @@ -212,7 +212,7 @@ class InferenceTranspiler(object): y_var = self.block.var(bn_op.input("Bias")[0]) out_var = self.block.var(bn_op.output("Y")[0]) - bias_op = self.block.insert_op( + bias_op = self.block._insert_op( index, type="elementwise_add", inputs={"X": x_var, @@ -307,4 +307,4 @@ class InferenceTranspiler(object): for var in self.block.vars.keys(): if var not in args: - self.block.remove_var(var) + self.block._remove_var(var) diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 999ef43ca0feacbddff5f9db59589ce7097fe77e..dd90d66110e6233806b04bb726636a915f2ad84a 100644 --- 
a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -177,7 +177,7 @@ class ControlFlowGraph(object): in_diff) if can_optimize: index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 - delete_op = block_desc.insert_op(index) + delete_op = block_desc._insert_op(index) delete_op.set_type("delete_var") delete_op.set_input("X", can_optimize) if is_forward: diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 3b059735a924d58714cd88a761eb83143f1192d6..678026cf95970e8ff58c1bad20246059ffb464c1 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator: TODO(yuyang18): Should we add whole design doc here? """ -import decorator -from decorator import * +import paddle.reader.decorator +from paddle.reader.decorator import * -import creator +import paddle.reader.creator __all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 1f83cabb8481451736944823be45185deea4f43b..4b1fe94222d35f8c0e4e4cccc364227a3f9509d0 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -20,7 +20,7 @@ __all__ = [ from threading import Thread import subprocess -from Queue import Queue +from six.moves.queue import Queue import itertools import random import zlib diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 9235c41e9eb95b25a0dc53a494a203e7a4525981..08d8bd68f9b7eb703c15f7cb5ad1300969db5713 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -182,7 +182,7 @@ def resize_short(im, size): h_new = size * h / w else: w_new = size * w / h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC) return im @@ -324,7 +324,6 @@ def 
simple_transform(im, if np.random.randint(2) == 0: im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size, is_color) im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) diff --git a/python/requirements.txt b/python/requirements.txt index ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816..c091ecb111bda9d5e83c3ddcae93aed0745f9e4c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -8,4 +8,4 @@ scipy>=0.19.0 Pillow nltk>=3.2.2 graphviz -LinkChecker +six diff --git a/python/setup.py.in b/python/setup.py.in index 52138b414e3d908e7aa589e76fe924e138e54d83..a81cd19e10153be0d07badfa0c0fbcb01fe568f7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -17,7 +17,8 @@ def git_commit(): git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() except: git_commit = 'Unknown' - return git_commit + git_commit = git_commit.decode() + return str(git_commit) def _get_version_detail(idx): assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ @@ -42,12 +43,13 @@ def get_patch(): def is_taged(): try: - cmd = ['git', 'describe', '--exact-match', '--tags'] + cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + git_tag = git_tag.decode() except: return False - if git_tag.replace('v', '') == '@PADDLE_VERSION@': + if str(git_tag).replace('v', '') == '@PADDLE_VERSION@': return True else: return False @@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s' def show(): if istaged: - print 'full_version:', full_version - print 'major:', major - print 'minor:', minor - print 'patch:', patch - print 'rc:', rc + print('full_version:', full_version) + print('major:', major) + print('minor:', minor) + print('patch:', patch) + print('rc:', rc) else: - print 'commit:', commit + print('commit:', commit) def mkl(): return with_mkl @@ -181,6 +183,14 @@ else: command = "patchelf --set-rpath 
'$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" if os.system(command) != 0: raise Exception("patch core.so failed, command: %s" % command) +if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/tools/check_pr_approval.py b/tools/check_pr_approval.py new file mode 100644 index 0000000000000000000000000000000000000000..937b0be7562fab93157c16b942631f0a580dfc68 --- /dev/null +++ b/tools/check_pr_approval.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from __future__ import print_function
import sys
import json


def check_approval(count, required_reviewers):
    """Print "TRUE" when enough of the required reviewers approved, else "FALSE".

    The review list is read from stdin as a single JSON document: an
    iterable of objects, each carrying a ``state`` key and, for approved
    reviews, a ``user`` object with an ``id``.
    NOTE(review): this looks like the payload of GitHub's "list reviews for
    a pull request" endpoint -- confirm against the CI script that pipes
    data into this tool.

    Args:
        count (int): minimum number of distinct required reviewers that
            must have at least one review in state "APPROVED".
        required_reviewers (iterable of str): reviewer ids; each one is
            converted with ``int()`` before being compared to ``user.id``.

    Returns:
        None. The verdict is written to stdout as "TRUE" or "FALSE".

    Raises:
        ValueError: stdin is not valid JSON, or a reviewer id is not an
            integer literal.
        KeyError / TypeError: a review entry lacks ``state`` or, when
            approved, ``user.id``.
    """
    # Read stdin in one go; line-by-line re-joining copies the whole buffer
    # on every iteration.
    reviews = json.loads(sys.stdin.read())

    # A set de-duplicates users who approved more than once.
    approved_user_ids = set(
        review["user"]["id"] for review in reviews
        if review["state"] == "APPROVED")

    # Ids arrive as command-line strings; user ids in the payload are
    # compared as ints.
    required_ids = set(int(reviewer) for reviewer in required_reviewers)

    if len(approved_user_ids & required_ids) >= count:
        print("TRUE")
    else:
        print("FALSE")


if __name__ == "__main__":
    # argv[1] is the required approval count, argv[2:] the reviewer ids.
    if len(sys.argv) > 1 and sys.argv[1].isdigit():
        check_approval(int(sys.argv[1]), sys.argv[2:])
    else:
        print(
            "Usage: python check_pr_approval.py [count] [required reviewer id] ..."
        )