Commit 7b7a4afa authored by Zhen Wang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-default-value

Change the default value of the parameter 'drop_last' in 'paddle.batch' to False.
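For context, a minimal sketch of what the new default means for callers, assuming the `paddle.batch` reader decorator whose signature after this change is `paddle.batch(reader, batch_size, drop_last=False)`:

```python
import paddle

def reader():
    # A toy 10-element reader; 10 is not divisible by the batch size.
    for i in range(10):
        yield i

# With drop_last now defaulting to False, the trailing short batch [9]
# is yielded instead of being silently dropped.
batched = paddle.batch(reader, batch_size=3)
for batch in batched():
    print(batch)  # [0, 1, 2], [3, 4, 5], [6, 7, 8], [9]
```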
@@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
+
+# PY_VERSION
+if(NOT PY_VERSION)
+  set(PY_VERSION 2.7)
+endif()

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -146,6 +152,7 @@ endif()
 ########################################################################################
 include(external/mklml)    # download mklml package
+include(external/libxsmm)  # download, build, install libxsmm
 include(external/zlib)     # download, build, install zlib
 include(external/gflags)   # download, build, install gflags
 include(external/glog)     # download, build, install glog
@@ -232,6 +239,10 @@ if(WITH_MKLML)
 list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
+
+if(WITH_LIBXSMM)
+  list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
+endif()
 if(WITH_MKLDNN)
 list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
@@ -271,7 +282,3 @@ if(WITH_DOC)
 find_python_module(recommonmark REQUIRED)
 add_subdirectory(doc)
 endif()
-
-if (WITH_CONTRIB)
-  add_subdirectory(paddle/contrib)
-endif()
@@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
 pip install opencv-python
 #For docstring checker
-RUN pip install pylint pytest astroid isort
+RUN pip install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
...
@@ -210,7 +210,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     # generate fake:
     if args.use_fake_data:
         for var in feed_var_list:
-            v = startup_prog.global_block().clone_variable(var)
+            v = startup_prog.global_block()._clone_variable(var)
             var.persistable = True
             v.persistable = True
...
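For reference, a minimal sketch of the fake-data trick the benchmark hunk above performs, assuming the fluid `Program`/`Block` API of this release:

```python
import paddle.fluid as fluid

startup_prog = fluid.Program()
main_prog = fluid.Program()

with fluid.program_guard(main_prog, startup_prog):
    image = fluid.layers.data(name='image', shape=[784], dtype='float32')

# Clone each feed variable into the startup program and mark both
# copies persistable, so data fed once survives across iterations
# (note the now-private method name: _clone_variable).
v = startup_prog.global_block()._clone_variable(image)
image.persistable = True
v.persistable = True
```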
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
IF(NOT WITH_LIBXSMM)
  return()
ENDIF()

IF(WIN32 OR APPLE OR ANDROID OR IOS)
  MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
  SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
  return()
ENDIF()
INCLUDE (ExternalProject)
SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
"${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
ExternalProject_Add(
  extern_libxsmm
  GIT_REPOSITORY    "https://github.com/hfp/libxsmm.git"
  GIT_TAG           "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
  PREFIX            ${LIBXSMM_SOURCES_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_IN_SOURCE   1
  BUILD_COMMAND     $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
  INSTALL_COMMAND   ""
)
ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
@@ -121,6 +121,11 @@ ELSE()
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+
+IF(WITH_LIBXSMM)
+  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
+  ADD_DEPENDENCIES(cblas extern_libxsmm)
+ENDIF()
 IF(NOT ${CBLAS_FOUND})
 ADD_DEPENDENCIES(cblas extern_openblas)
 LIST(APPEND external_project_dependencies cblas)
...
@@ -18,8 +18,9 @@ ENDIF()
 INCLUDE(python_module)
-FIND_PACKAGE(PythonInterp 2.7)
-FIND_PACKAGE(PythonLibs 2.7)
+FIND_PACKAGE(PythonInterp ${PY_VERSION})
+FIND_PACKAGE(PythonLibs ${PY_VERSION})
 # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
 ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
...
@@ -138,25 +138,24 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)
-if(WITH_CONTRIB)
-  message(STATUS "installing contrib")
-  set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-  if (WITH_ANAKIN AND WITH_GPU)
-    copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
-      SRCS
-      ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
-      ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
-      DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
-    list(APPEND inference_deps contrib_anakin_inference_lib)
-  endif()
-  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
-    SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-    ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
-    DSTS ${contrib_dst_dir} ${contrib_dst_dir})
-  list(APPEND inference_deps contrib_inference_lib)
-endif()
+set(module "inference/api")
+if (WITH_ANAKIN AND WITH_GPU)
+  copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+    SRCS
+    ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
+    ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+    DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+  list(APPEND inference_deps anakin_inference_lib)
+endif()
+copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
+  SRCS ${src_dir}/${module}/paddle_inference_api.h
+  ${src_dir}/${module}/demo_ci
+  ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
+  DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
+)
+list(APPEND inference_deps inference_api_lib)
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
 SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
...
@@ -98,13 +98,13 @@ class Block(objects):
     def append_operator(self, ...):
         self.ops.append(Operator(self, ...))

-    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+    def _prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
         self.ops.prepend(Operator(self, ...))
 ```

 `create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.

-`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.

 ### Operator
...
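To make the renaming above concrete, here is a toy sketch of the intended split between the public `append_operator` and the private `_prepend_operator` (simplified stand-in classes, not Paddle's actual implementation):

```python
class Operator(object):
    def __init__(self, block, op_type):
        self.block = block
        self.type = op_type

class Block(object):
    def __init__(self):
        self.ops = []

    def append_operator(self, op_type):    # public API
        self.ops.append(Operator(self, op_type))

    def _prepend_operator(self, op_type):  # private: used by Parameter's ctor
        self.ops.insert(0, Operator(self, op_type))

block = Block()
block.append_operator('mul')
block._prepend_operator('fill_constant')  # initializer lands in the preamble
print([op.type for op in block.ops])      # ['fill_constant', 'mul']
```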
@@ -78,7 +78,7 @@ def error_clip_callback(block, context):
     op_desc = block.desc.op(block.desc.op_size() - 1)
     for grad_n in filter(lambda n: grad_to_var.has_key(n),
                          op_desc.output_arg_names()):
-        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        fwd_var = block.__var_recursive(grad_to_var[grad_n])
         error_clip = getattr(fwd_var, "error_clip", None)
         if not (error_clip is None or isinstance(error_clip,
                                                  BaseErrorClipAttr)):
...
@@ -4,7 +4,6 @@ API
 .. toctree::
    :maxdepth: 1

-   overview.rst
    model_configs.rst
    data.rst
    run_logic.rst
@@ -35,11 +35,16 @@ PaddlePaddle requires a Docker environment for building, which saves you from having to install…
 # 2. Optional: build the Docker image used to compile PaddlePaddle from the source tree
 docker build -t paddle:dev .
 # 3. Run the following command to build CPU-only binaries
-docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
 # 4. Or use the image built in the optional step above (step 2 must be run first)
 docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build

-Note: the commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
+Note:
+
+- The commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
+- If you build with a manylinux image, you need to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__ through the environment variable :code:`PYTHON_ABI`.
+  The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.

 After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine:
...
@@ -36,13 +36,18 @@ If you don't wish to use docker, you need to install several compile dependenci…
 # 2. Optional: build development docker image from source
 docker build -t paddle:dev .
 # 3. Run the following command to build CPU-only binaries
-docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
 # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2 first)
 docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build

-NOTE: The above command tries to mount the current working directory (root directory of the source code)
-into the :code:`/paddle` directory inside the docker container.
+NOTE:
+
+- The above command tries to mount the current working directory (root directory of the source code)
+  into the :code:`/paddle` directory inside the docker container.
+- You need to pass in the environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+  The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.

 When the compile finishes, you can get the output whl package under
 build/python/dist; then you can choose to install the whl on the local
 machine or copy it to the target machine.
...
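Both language versions of the note require choosing between the two supported CPython 2.7 ABI tags. As a quick aid (a sketch, not part of the docs): the `mu` suffix denotes a wide-unicode (UCS-4) interpreter build, which you can detect from Python itself:

```python
import sys

# A CPython 2.7 built with --enable-unicode=ucs4 (typical on Linux)
# reports sys.maxunicode == 0x10FFFF and matches cp27-cp27mu;
# a narrow (UCS-2) build matches cp27-cp27m.
abi = 'cp27-cp27mu' if sys.maxunicode == 0x10FFFF else 'cp27-cp27m'
print(abi)  # pass this value as PYTHON_ABI to the manylinux build
```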
@@ -118,7 +118,7 @@ class Float16Transpiler:
         for var in self.block.vars.keys():
             if var not in args:
-                self.block.remove_var(var)
+                self.block._remove_var(var)

     def _modify_feed_fetch(self):
         '''
@@ -165,7 +165,7 @@ class Float16Transpiler:
                 dtype=core.VarDesc.VarType.FP16,
                 shape=var.shape,
                 persistable=var.persistable)
-            self.block.insert_op(
+            self.block._insert_op(
                 i + 1,
                 type="cast",
                 inputs={"X": var},
@@ -188,7 +188,7 @@ class Float16Transpiler:
                 persistable=var.persistable)
             find_op(var)
             var.op.rename_output(var_name, tmp_var_name)
-            self.block.insert_op(
+            self.block._insert_op(
                 i,
                 type="cast",
                 inputs={"X": tmp_var},
@@ -253,4 +253,4 @@ class Float16Transpiler:
         # old var will be replaced by the fp16 var in program desc
         self.input_map[var.name] = fp16_var_name
-        self.block.remove_var(var.name)
+        self.block._remove_var(var.name)
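The transpiler hunks above move to the renamed private Block methods. A minimal sketch of the cast-insertion pattern they rely on, assuming the fluid Block API of this release (`create_var`, `_insert_op`) and a hypothetical variable named `x`:

```python
import paddle.fluid as fluid
import paddle.fluid.core as core

prog = fluid.Program()
block = prog.global_block()
var = block.create_var(name='x', shape=[1, 16], dtype='float32')

# Create an fp16 twin of `var` and insert a cast op at a chosen
# position (index 0 here is illustrative; the transpiler uses i + 1).
fp16_var = block.create_var(
    name='x.fp16', dtype=core.VarDesc.VarType.FP16, shape=var.shape)
block._insert_op(
    0,
    type='cast',
    inputs={'X': var},
    outputs={'Out': fp16_var},
    attrs={'in_dtype': var.dtype, 'out_dtype': fp16_var.dtype})
```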
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
  return()
endif()

set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)

function(inference_download_test_demo TARGET)
  if (NOT WITH_TESTING)
    return()
  endif()
  set(options "")
  set(oneValueArgs URL)
  set(multiValueArgs SRCS)
  cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
  message(STATUS "inference demo ${test_dir}")
  if(NOT EXISTS "${test_dir}")
    message(STATUS "Download ${TARGET} model from ${tests_URL}")
    execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
    execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
    execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
  endif()
  cc_test(${TARGET} SRCS "${tests_SRCS}"
          DEPS paddle_inference_api paddle_fluid
          ARGS --data=${test_dir}/data.txt
               --modeldir=${test_dir}/model
               --refer=${test_dir}/result.txt)
endfunction()

# disable mobilenet test
#inference_download_test_demo(mobilenet_inference_demo
#    SRCS vis_demo.cc
#    URL ${URL_ROOT}mobilenet.tar.gz)
inference_download_test_demo(se_resnext50_inference_demo
    SRCS vis_demo.cc
    URL ${URL_ROOT}se_resnext50.tar.gz)
inference_download_test_demo(ocr_inference_demo
    SRCS vis_demo.cc
    URL ${URL_ROOT}ocr.tar.gz)
# Inference Demos

Input data format:

- Each line contains a single record
- Each record's format is

```
<space separated floats as data>\t<space separated ints as shape>
```

Follow the C++ code in `vis_demo.cc`.
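A small sketch of producing one record in this format from Python (hypothetical helper, only illustrating the layout described above):

```python
import numpy as np

def write_record(f, tensor):
    # One record per line: space-separated floats, a tab,
    # then the space-separated shape.
    data = ' '.join(str(v) for v in tensor.flatten())
    shape = ' '.join(str(d) for d in tensor.shape)
    f.write(data + '\t' + shape + '\n')

with open('data.txt', 'w') as f:
    write_record(f, np.random.rand(1, 3, 224, 224).astype('float32'))
```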
## MobileNet
To execute the demo, simply run
```sh
./mobilenet_inference_demo --modeldir <model> --data <datafile>
```
## SE-ResNeXt-50
To execute the demo, simply run
```sh
./se_resnext50_inference_demo --modeldir <model> --data <datafile>
```
## OCR
To execute the demo, simply run
```sh
./ocr_inference_demo --modeldir <model> --data <datafile>
```
paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False))
paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.copy_data_info_from ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.prune ArgSpec(args=['self', 'targets'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.rollback ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None)
paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None)
paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, defaults=None)
paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10))
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
paddle.fluid.initializer.XavierInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0))
paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'use_mkldnn', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, False, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid'))
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, False, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'use_mkldnn', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, False, None))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'use_mkldnn', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, False, None, None, None, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_max ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None))
paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.fill_constant_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0))
paddle.fluid.layers.fill_constant ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.argmin ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.argmax ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.layers.DynamicRNN.memory ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32'))
paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.read_input ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ParallelDo.write_output ArgSpec(args=['self', 'var'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Print ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both'))
paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.scale ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_add ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_div ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_sub ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_max ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_min ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elementwise_pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.exp ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.tanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.tanh_shrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sqrt ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.abs ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.ceil ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.floor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.cos ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sin ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.round ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.reciprocal ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.square ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.softplus ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.softsign ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.brelu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.leaky_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.soft_relu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.elu ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.relu6 ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.pow ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.stanh ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.hard_sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.swish ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False))
paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.HashName.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True, False))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core.LoDTensor) -> None
paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None
paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False))
paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,))
paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeeder.feed_parallel ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.clip.ErrorClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.clip.GradientClipByValue.__init__ ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.clip.GradientClipByNorm.__init__ ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None)
paddle.fluid.clip.GradientClipByGlobalNorm.__init__ ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',))
paddle.fluid.profiler.cuda_profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.profiler.reset_profiler ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.profiler.profiler ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.profiler.start_profiler ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None)
paddle.fluid.profiler.stop_profiler ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile'))
paddle.fluid.unique_name.generate ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None)
paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None))
paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
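The LoDTensor entries above expose the level of detail in two equivalent forms: cumulative offsets (lod/set_lod, e.g. [[0, 2, 5]]) and per-sequence lengths (recursive_sequence_lengths/set_recursive_sequence_lengths, e.g. [[2, 3]]). A minimal standalone sketch of the offset/length relationship, using plain std containers rather than the real paddle.fluid.core types:

#include <cassert>
#include <vector>

// Convert length-based LoD (e.g. {{2, 3}}) to offset-based LoD ({{0, 2, 5}}).
std::vector<std::vector<size_t>> LengthsToOffsets(
    const std::vector<std::vector<size_t>>& lengths) {
  std::vector<std::vector<size_t>> offsets;
  for (const auto& level : lengths) {
    std::vector<size_t> row{0};
    for (size_t len : level) row.push_back(row.back() + len);
    offsets.push_back(row);
  }
  return offsets;
}

int main() {
  // Two sequences of lengths 2 and 3 -> offsets 0, 2, 5.
  auto offsets = LengthsToOffsets({{2, 3}});
  assert(offsets[0] == (std::vector<size_t>{0, 2, 5}));
  return 0;
}

has_valid_recursive_sequence_lengths presumably checks that such a conversion lines up with the tensor's first dimension (here, 5 rows).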
...@@ -276,6 +276,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -276,6 +276,13 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} }
} }
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr;
#endif
if (use_gpu ||
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
// Insert BCast Ops // Insert BCast Ops
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id]; auto &to_bcast_set = bcast_var_name_set[dev_id];
...@@ -283,6 +290,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -283,6 +290,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
CreateBroadcastOp(&result, bcast_name, dev_id); CreateBroadcastOp(&result, bcast_name, dev_id);
} }
} }
}
/* /*
Dependency graph has been constructed. However, there are still data Dependency graph has been constructed. However, there are still data
hazards need to be handled. hazards need to be handled.
...@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { ...@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1; return -1;
} }
int op_role = boost::get<int>(
op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
return -1;
}
auto param_grad = boost::get<std::vector<std::string>>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
for (auto &varname : op.InputArgumentNames()) { PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(varname); int dev_id = GetVarDeviceID(param_grad[1]);
if (dev_id != -1) { PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
param_grad[0]);
return dev_id; return dev_id;
}
}
return -1;
} }
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
......
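The new GetOpDeviceID path above stops scanning all input names and instead keys placement off the op's role attribute and its (param, grad) pair: optimizer ops are pinned to whichever device holds the gradient. A rough standalone sketch of that lookup, with a plain map standing in for the builder's var-to-device table (names here are illustrative, not Paddle's actual members):

#include <map>
#include <string>
#include <vector>

enum class OpRole { kForward, kBackward, kOptimize };

// Returns the device of the gradient var for optimizer ops, -1 otherwise.
int GetOpDeviceID(OpRole role, const std::vector<std::string>& param_grad,
                  const std::map<std::string, int>& var_device) {
  if (role != OpRole::kOptimize) return -1;  // only optimizer ops are pinned
  if (param_grad.size() != 2) return -1;     // expect {param, grad}
  auto it = var_device.find(param_grad[1]);  // place op where the grad lives
  return it == var_device.end() ? -1 : it->second;
}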
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
...@@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
} }
} }
} }
std::vector<framework::LoDTensor> fetch_data;
std::exception_ptr eptr;
try {
fetch_data = underlying_executor_->Run(fetch_tensors);
} catch (...) {
eptr = std::current_exception();
}
auto fetch_data = underlying_executor_->Run(fetch_tensors);
drop_scope_counter_ += 1; drop_scope_counter_ += 1;
if (!fetch_tensors.empty() || if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
...@@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
} }
if (eptr) {
std::rethrow_exception(eptr);
} else {
return fetch_data; return fetch_data;
}
} }
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
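The scope_buffered change above is the classic capture-now, rethrow-later idiom: an exception thrown inside the executor's Run must not skip the scope bookkeeping that follows the call. A self-contained sketch of the same idiom, with illustrative names:

#include <exception>
#include <iostream>
#include <stdexcept>

int RunMayThrow() { throw std::runtime_error("worker failed"); }

int main() {
  int result = 0;
  std::exception_ptr eptr;
  try {
    result = RunMayThrow();
  } catch (...) {
    eptr = std::current_exception();  // defer the exception
  }
  std::cout << "cleanup always runs\n";    // e.g. the drop_scope_counter_ update
  if (eptr) std::rethrow_exception(eptr);  // surface the failure afterwards
  return result;
}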
...@@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
set.clear(); set.clear();
}; };
// Clean run context
run_op_futures_.clear();
exception_.reset();
// Step 3. Execution // Step 3. Execution
while (!pending_vars.empty()) { while (!pending_vars.empty()) {
// 1. Run All Ready ops // 1. Run All Ready ops
...@@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout); auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) { if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_); std::unique_lock<std::mutex> l(exception_mu_);
if (exception_) { if (exception_) {
l.unlock();
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
l.lock();
std::exception *exp = exception_.get(); std::exception *exp = exception_.get();
if (dynamic_cast<platform::EOFException *>(exp)) { if (dynamic_cast<platform::EOFException *>(exp)) {
auto e = *static_cast<platform::EOFException *>(exp); auto e = *static_cast<platform::EOFException *>(exp);
exception_.reset();
throw e; throw e;
} else if (dynamic_cast<platform::EnforceNotMet *>(exp)) { } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
auto e = *static_cast<platform::EnforceNotMet *>(exp); auto e = *static_cast<platform::EnforceNotMet *>(exp);
exception_.reset();
throw e; throw e;
} else { } else {
LOG(FATAL) << "Unknown exception."; LOG(FATAL) << "Unknown exception.";
...@@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp(
} }
}; };
if (pool_) { if (pool_) {
pool_->enqueue(op_run); run_op_futures_.emplace_back(pool_->enqueue(op_run));
} else { } else {
op_run(); op_run();
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <deque> #include <deque>
#include <list>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
...@@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
private: private:
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
// use std::list because push_back is O(1) and never invalidates iterators
std::list<std::future<void>> run_op_futures_;
}; };
} // namespace details } // namespace details
......
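The threaded executor now records every enqueued op as a future and, on failure, waits for all in-flight ops before rethrowing, so no worker is still touching shared state when the exception propagates. A minimal std-only sketch of that pattern (std::async stands in for the executor's thread pool):

#include <cstdio>
#include <exception>
#include <future>
#include <list>
#include <stdexcept>

int main() {
  std::list<std::future<void>> run_op_futures;
  for (int i = 0; i < 4; ++i) {
    run_op_futures.emplace_back(std::async(std::launch::async, [i] {
      if (i == 2) throw std::runtime_error("op failed");
    }));
  }
  std::exception_ptr first_error;
  for (auto& f : run_op_futures) {  // drain every worker before rethrowing
    try {
      f.get();
    } catch (...) {
      if (!first_error) first_error = std::current_exception();
    }
  }
  run_op_futures.clear();  // reset the run context, as in the diff
  if (first_error) {
    try {
      std::rethrow_exception(first_error);
    } catch (const std::exception& e) {
      std::fprintf(stderr, "%s\n", e.what());
      return 1;
    }
  }
  return 0;
}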
...@@ -45,6 +45,7 @@ class ParallelExecutorPrivate { ...@@ -45,6 +45,7 @@ class ParallelExecutorPrivate {
#endif #endif
bool own_local_scope_; bool own_local_scope_;
bool use_cuda_; bool use_cuda_;
bool use_all_reduce_;
}; };
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() { std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
...@@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor(
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce',"
"the number of places must be greater than 1.");
}
// Step 1. Bcast the params to devs. // Step 1. Bcast the params to devs.
// Create local scopes // Create local scopes
...@@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
} }
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToGPUs(bcast_vars); BCastParamsToDevices(bcast_vars);
} }
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
...@@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA.");
#endif #endif
} }
...@@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor( ...@@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor(
member_->places_, std::move(member_->executor_))); member_->places_, std::move(member_->executor_)));
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
// the the initializing bcast, all vars would be bcast from device(0), // the initializing bcast, all vars would be bcast from device(0),
// otherwise // otherwise
// bcast from the specified device. // bcast from the specified device.
bool initializing = builder_.get() == nullptr ? true : false; bool initializing = builder_.get() == nullptr ? true : false;
...@@ -202,12 +211,23 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -202,12 +211,23 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif #endif
} else { } else {
platform::CPUPlace cpu; platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id))
continue;
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
// FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
if (member_->use_all_reduce_ || member_->use_cuda_ ||
var == "@LR_DECAY_COUNTER@") {
t->Resize(dims); t->Resize(dims);
t->mutable_data(cpu, main_tensor.type()); t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t); paddle::framework::TensorCopy(main_tensor, cpu, t);
} else {
t->ShareDataWith(main_tensor);
}
} }
} }
} }
......
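The BCastParamsToDevices change above distinguishes two CPU cases: all-reduce mode (and a few special vars such as @LR_DECAY_COUNTER@) still gets a private copy per place, while reduce-mode CPU training lets every local scope alias the same underlying tensor. The trade-off, sketched with a shared_ptr standing in for ShareDataWith versus a deep copy for TensorCopy (purely illustrative, not Paddle's tensor types):

#include <cassert>
#include <memory>
#include <vector>

using Buffer = std::vector<float>;

int main() {
  auto main_tensor = std::make_shared<Buffer>(Buffer{1.f, 2.f, 3.f});

  std::shared_ptr<Buffer> shared = main_tensor;          // like ShareDataWith
  auto copied = std::make_shared<Buffer>(*main_tensor);  // like TensorCopy

  (*main_tensor)[0] = 9.f;      // an update on device 0 ...
  assert((*shared)[0] == 9.f);  // ... is visible through the alias
  assert((*copied)[0] == 1.f);  // ... but not in the private copy
  return 0;
}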
...@@ -66,7 +66,7 @@ class ParallelExecutor { ...@@ -66,7 +66,7 @@ class ParallelExecutor {
void Run(const std::vector<std::string> &fetch_tensors, void Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name); const std::string &fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const; void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
private: private:
ParallelExecutorPrivate *member_; ParallelExecutorPrivate *member_;
......
...@@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped }; ...@@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped };
class ReaderBase { class ReaderBase {
public: public:
void ReadNext(std::vector<LoDTensor>* out); virtual void ReadNext(std::vector<LoDTensor>* out);
void Shutdown(); virtual void Shutdown();
void Start(); virtual void Start();
// Return the readers which are at the end of the decorating chain. Basically // Return the readers which are at the end of the decorating chain. Basically
// they are the readers just before the read op. // they are the readers just before the read op.
...@@ -42,7 +42,7 @@ class ReaderBase { ...@@ -42,7 +42,7 @@ class ReaderBase {
virtual ~ReaderBase(); virtual ~ReaderBase();
protected: protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0; virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}
virtual void ShutdownImpl() {} virtual void ShutdownImpl() {}
......
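The reader.h hunk turns ReadNext/Shutdown/Start from non-virtual entry points into virtual ones, and relaxes ReadNextImpl from pure virtual to an empty default, so a decorating reader may override either the outer call or only the inner hook. A stripped-down sketch of that two-level override scheme (simplified element type, not the real interface):

#include <iostream>
#include <vector>

class ReaderBase {
 public:
  virtual void ReadNext(std::vector<int>* out) { ReadNextImpl(out); }
  virtual ~ReaderBase() = default;

 protected:
  virtual void ReadNextImpl(std::vector<int>* out) {}  // default: no-op
};

class FileReader : public ReaderBase {
 protected:
  void ReadNextImpl(std::vector<int>* out) override { *out = {1, 2, 3}; }
};

// A decorator that overrides the outer virtual call instead of the hook.
class DoubleReader : public ReaderBase {
 public:
  explicit DoubleReader(ReaderBase* r) : reader_(r) {}
  void ReadNext(std::vector<int>* out) override {
    reader_->ReadNext(out);
    for (auto& v : *out) v *= 2;
  }

 private:
  ReaderBase* reader_;
};

int main() {
  FileReader file;
  DoubleReader doubled(&file);
  std::vector<int> batch;
  doubled.ReadNext(&batch);
  std::cout << batch[0] << batch[1] << batch[2] << "\n";  // prints 246
  return 0;
}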
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor ) # analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
...@@ -7,12 +14,14 @@ cc_library(paddle_fluid_api ...@@ -7,12 +14,14 @@ cc_library(paddle_fluid_api
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
if(WITH_CONTRIB)
set(fluid_modules "${fluid_modules}" paddle_inference_api)
endif()
# Create static library # Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED cc_library(paddle_fluid_shared SHARED
SRCS io.cc SRCS io.cc
...@@ -29,9 +38,4 @@ if(WITH_TESTING) ...@@ -29,9 +38,4 @@ if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book) add_subdirectory(tests/book)
endif() endif()
add_subdirectory(api)
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
...@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const { ...@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
return dot.Build(); return dot.Build();
} }
std::string DataFlowGraph::HumanReadableInfo(bool show_values,
bool show_functions) const {
std::stringstream values, functions;
for (auto &n : nodes.nodes()) {
if (show_values && n->IsValue()) {
values << n->repr() << "\n";
}
if (show_functions && n->IsFunction()) {
functions << n->repr() << "\n";
}
}
return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
}
// //
// NodesBFSIterator // NodesBFSIterator
// //
...@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() { ...@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
return stack_.top(); return stack_.top();
} }
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
inlink_visited.clear();
std::copy_if(p->inlinks.begin(), p->inlinks.end(),
std::back_inserter(inlink_visited),
[&](Node *x) { return visited.count(x); });
if (inlink_visited.size() == p->inlinks.size()) {
sorted_.push_back(p);
for (auto *_ : p->outlinks) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
&GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
const paddle::inference::analysis::GraphTraits<
DataFlowGraph>::NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
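The NodesTSIterator constructor above is essentially Kahn's algorithm: repeatedly emit any node all of whose in-links have already been visited. The same idea in a self-contained form over an adjacency list, with indices playing the role of Node pointers:

#include <iostream>
#include <queue>
#include <vector>

// Kahn's topological sort over a DAG given as adjacency lists.
std::vector<int> TopoSort(const std::vector<std::vector<int>>& out_edges) {
  std::vector<int> in_degree(out_edges.size(), 0);
  for (const auto& outs : out_edges)
    for (int v : outs) ++in_degree[v];

  std::queue<int> ready;
  for (int u = 0; u < static_cast<int>(out_edges.size()); ++u)
    if (in_degree[u] == 0) ready.push(u);

  std::vector<int> sorted;
  while (!ready.empty()) {
    int u = ready.front();
    ready.pop();
    sorted.push_back(u);
    for (int v : out_edges[u])
      if (--in_degree[v] == 0) ready.push(v);
  }
  return sorted;  // fewer entries than nodes would indicate a cycle
}

int main() {
  // The graph from the TS test below: 0->4, 0->5, 1->6, 2->7, 4->5, 4->7, 4->3, 7->3.
  std::vector<std::vector<int>> g{{4, 5}, {6}, {7}, {}, {5, 7, 3}, {}, {}, {3}};
  for (int id : TopoSort(g)) std::cout << id << ' ';  // prints: 0 1 2 4 6 5 7 3
  std::cout << '\n';
  return 0;
}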
...@@ -48,6 +48,9 @@ struct DataFlowGraph { ...@@ -48,6 +48,9 @@ struct DataFlowGraph {
// Output a DOT graph file for debug. // Output a DOT graph file for debug.
std::string DotString() const; std::string DotString() const;
std::string HumanReadableInfo(bool show_values = true,
bool show_functions = true) const;
private: private:
// Remove duplicate edges and so on. // Remove duplicate edges and so on.
void Clean(); void Clean();
...@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> { ...@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
std::unordered_set<Node *> visited_; std::unordered_set<Node *> visited_;
}; };
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) the current implementation just compares the first
// element; it needs to compare the graph and all the elements in the queue
// and set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::vector<Node *> sorted_;
int cursor_{0};
};
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
// default use BFS to visit the nodes. // default use BFS to visit the nodes.
...@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> { ...@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
iterator_range<NodesDFSIterator> nodes_in_DFS() { iterator_range<NodesDFSIterator> nodes_in_DFS() {
return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end()); return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
} }
iterator_range<NodesTSIterator> nodes_in_TS() {
return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
}
private: private:
NodesBFSIterator nodes_bfs_begin() { NodesBFSIterator nodes_bfs_begin() {
return NodesBFSIterator(graph_->inputs); return NodesBFSIterator(graph_->inputs);
} }
NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
NodesDFSIterator nodes_dfs_begin() { NodesDFSIterator nodes_dfs_begin() {
return NodesDFSIterator(graph_->inputs); return NodesDFSIterator(graph_->inputs);
} }
NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
private: private:
DataFlowGraph *graph_; DataFlowGraph *graph_;
}; };
......
...@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) { ...@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
dfg.Build(); dfg.Build();
for (auto* in : dfg.inputs) { for (auto *in : dfg.inputs) {
LOG(INFO) << "inputs: " << in->name() << " " LOG(INFO) << "inputs: " << in->name() << " "
<< static_cast<int>(in->type()); << static_cast<int>(in->type());
} }
for (auto* out : dfg.outputs) { for (auto *out : dfg.outputs) {
LOG(INFO) << "outputs: " << out->name() << " " LOG(INFO) << "outputs: " << out->name() << " "
<< static_cast<int>(out->type()); << static_cast<int>(out->type());
} }
...@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) { ...@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
ASSERT_EQ(count, dfg.nodes.size()); ASSERT_EQ(count, dfg.nodes.size());
} }
// Topological sorting.
/*
* Graph topology
* inputs: 0, 1, 2
* 0 -> 4
* 0 -> 5
* 1 -> 6
* 2 -> 7
* 4 -> 5
* 4 -> 7
* 4 -> 3
* 7 -> 3
*/
TEST(DataFlowGraph, TS) {
DataFlowGraph graph;
for (int i = 0; i < 8; i++) {
auto *node = graph.nodes.Create(Node::Type::kValue);
node->SetName("node-" + std::to_string(i));
}
auto add_link = [&](int i, int j) {
Node *source = graph.nodes.GetMutable(i);
Node *target = graph.nodes.GetMutable(j);
target->inlinks.push_back(source);
source->outlinks.push_back(target);
};
graph.inputs.push_back(graph.nodes.GetMutable(0));
graph.inputs.push_back(graph.nodes.GetMutable(1));
graph.inputs.push_back(graph.nodes.GetMutable(2));
add_link(0, 4);
add_link(0, 5);
add_link(1, 6);
add_link(2, 7);
add_link(4, 5);
add_link(4, 7);
add_link(4, 3);
add_link(7, 3);
auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
std::vector<int> sorted_ids;
for (auto it = its.begin(); it != its.end(); ++it) {
LOG(INFO) << it->name();
sorted_ids.push_back(it->id());
}
// Assert a occurs prior to b in the sorted_ids.
auto assert_positive_sequence_pair = [&](int a, int b) {
auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
ASSERT_LT(a_offset, b_offset);
};
assert_positive_sequence_pair(2, 7);
assert_positive_sequence_pair(7, 3);
assert_positive_sequence_pair(4, 3);
assert_positive_sequence_pair(0, 4);
assert_positive_sequence_pair(0, 5);
assert_positive_sequence_pair(1, 6);
assert_positive_sequence_pair(4, 5);
assert_positive_sequence_pair(4, 7);
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -43,53 +43,64 @@ function(inference_api_test TARGET_NAME) ...@@ -43,53 +43,64 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc SRCS api.cc api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
if(NOT APPLE)
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym")
set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Here the shared library doesn't depend on other fluid libraries, or double free will occur. # Here the shared library doesn't depend on other fluid libraries, or double free will occur.
cc_library(paddle_inference_api_shared SHARED cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc) SRCS api.cc api_impl.cc)
add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api) set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
if(NOT APPLE) if(NOT APPLE)
set(LINK_FLAGS "-fPIC -fvisibility=hidden") set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map")
set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
"execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
" ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
"if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
" message(FATAL_ERROR \"Check symbol failed.\")\n"
"endif()\n")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
DEPENDS paddle_inference_api_shared)
add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
endif() endif()
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc SRCS test_api.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl inference_api_test(test_api_impl
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS paddle_inference_api_tensorrt_subgraph_engine.cc SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif() endif()
if (WITH_ANAKIN) # only needed in CI if (WITH_ANAKIN) # only needed in CI
# Because Anakin has no official library releases and its protobuf and cuda versions do not match Paddle's, # Because Anakin has no official library releases and its protobuf and cuda versions do not match Paddle's,
# the anakin library will not be merged into our official inference library. To use the anakin prediction API, one needs to # the anakin library will not be merged into our official inference library. To use the anakin prediction API, one needs to
# compile libinference_anakin_api.a and link against anakin.so. # compile libinference_anakin_api.a and link against anakin.so.
nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common) target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING) if (WITH_TESTING)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api) DEPS inference_anakin_api)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING) endif(WITH_TESTING)
endif() endif()
if(WITH_TESTING)
add_subdirectory(demo)
endif()
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle { namespace paddle {
......
{
global:
*paddle*;
local:
*;
};
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h" #include "paddle/fluid/inference/api/api_anakin_engine.h"
#include <cuda.h> #include <cuda.h>
#include <vector>
namespace paddle { namespace paddle {
...@@ -47,13 +48,13 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -47,13 +48,13 @@ bool PaddleInferenceAnakinPredictor::Run(
} }
auto d_tensor_in_p = executor_.get_in(input.name); auto d_tensor_in_p = executor_.get_in(input.name);
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) { cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error"; LOG(ERROR) << "copy data from CPU to GPU error";
return false; return false;
} }
cudaStreamSynchronize(NULL);
} }
executor_.prediction(); executor_.prediction();
...@@ -69,13 +70,13 @@ bool PaddleInferenceAnakinPredictor::Run( ...@@ -69,13 +70,13 @@ bool PaddleInferenceAnakinPredictor::Run(
output.data.Resize(tensor->valid_size() * sizeof(float)); output.data.Resize(tensor->valid_size() * sizeof(float));
} }
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data(), if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
tensor->mutable_data(),
tensor->valid_size() * sizeof(float), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
LOG(ERROR) << "copy data from GPU to CPU error"; LOG(ERROR) << "copy data from GPU to CPU error";
return false; return false;
} }
cudaStreamSynchronize(NULL);
} }
return true; return true;
} }
...@@ -104,13 +105,12 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() { ...@@ -104,13 +105,12 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
// A factory to help create different predictors. // A factory to help create different predictors.
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>( AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create."; VLOG(3) << "Anakin Predictor create.";
std::unique_ptr<PaddlePredictor> x( std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor(config)); new PaddleInferenceAnakinPredictor(config));
return x; return x;
}; }
} // namespace paddle } // namespace paddle
...@@ -19,7 +19,8 @@ limitations under the License. */ ...@@ -19,7 +19,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/contrib/inference/paddle_inference_api.h" #include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin // from anakin
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
...@@ -31,7 +32,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { ...@@ -31,7 +32,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public: public:
PaddleInferenceAnakinPredictor() {} PaddleInferenceAnakinPredictor() {}
PaddleInferenceAnakinPredictor(const AnakinConfig& config); explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
// NOTE Unlike the native engine, the buffers of anakin engine's output_data // NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first. // should be allocated first.
...@@ -48,8 +49,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { ...@@ -48,8 +49,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
private: private:
bool Init(const AnakinConfig& config); bool Init(const AnakinConfig& config);
anakin::graph::Graph<anakin::NV, anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
anakin::saber::AK_FLOAT,
anakin::Precision::FP32> anakin::Precision::FP32>
graph_; graph_;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32> anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model."); DEFINE_string(model, "", "Directory of the inference model.");
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
namespace paddle { namespace paddle {
namespace { namespace {
...@@ -77,8 +77,8 @@ bool NativePaddlePredictor::Init( ...@@ -77,8 +77,8 @@ bool NativePaddlePredictor::Init(
if (!config_.model_dir.empty()) { if (!config_.model_dir.empty()) {
// Parameters are saved in separate files located in // Parameters are saved in separate files located in
// the specified `dirname`. // the specified `dirname`.
inference_program_ = paddle::inference::Load( inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
executor_.get(), scope_.get(), config_.model_dir); config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) { } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// The file names should be consistent with that used // The file names should be consistent with that used
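The two branches above correspond to the two ways a `NativeConfig` can point at a model; a minimal sketch (paths illustrative, not part of this commit):
```
// Hedged sketch of the two NativeConfig loading modes described above.
paddle::NativeConfig config;
// (a) parameters stored as separate files inside one directory:
config.model_dir = "/path/to/model_dir";
// (b) or a single program file plus one merged parameter file:
config.prog_file = "/path/to/model/__model__";
config.param_file = "/path/to/model/params";
```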
...@@ -91,8 +91,8 @@ bool NativePaddlePredictor::Init( ...@@ -91,8 +91,8 @@ bool NativePaddlePredictor::Init(
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
executor_->CreateVariables( executor_->CreateVariables(*inference_program_,
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names // Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames(); feed_target_names_ = inference_program_->GetFeedTargetNames();
...@@ -105,7 +105,7 @@ NativePaddlePredictor::~NativePaddlePredictor() { ...@@ -105,7 +105,7 @@ NativePaddlePredictor::~NativePaddlePredictor() {
PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
scope_->DeleteScope(sub_scope_); scope_->DeleteScope(sub_scope_);
} }
}; }
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) { std::vector<PaddleTensor> *output_data) {
...@@ -134,10 +134,8 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -134,10 +134,8 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
// if share variables, we need not create variables // if share variables, we need not create variables
VLOG(4) << "Run prepared context"; VLOG(4) << "Run prepared context";
executor_->RunPreparedContext( executor_->RunPreparedContext(
ctx_.get(), ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
sub_scope_ != nullptr ? sub_scope_ : scope_.get(), &feed_targets, &fetch_targets,
&feed_targets,
&fetch_targets,
false /* don't create variable each time */); false /* don't create variable each time */);
VLOG(4) << "Finish prepared context"; VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) { if (!GetFetch(fetchs, output_data)) {
...@@ -181,8 +179,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -181,8 +179,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
} }
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
inputs[i].data.data(),
inputs[i].data.length()); inputs[i].data.length());
feeds->push_back(input); feeds->push_back(input);
} }
...@@ -232,8 +229,7 @@ bool NativePaddlePredictor::GetFetch( ...@@ -232,8 +229,7 @@ bool NativePaddlePredictor::GetFetch(
size_t start = lod[0][j - 1] * common_dim; size_t start = lod[0][j - 1] * common_dim;
size_t end = lod[0][j] * common_dim; size_t end = lod[0][j] * common_dim;
if (end > start) { if (end > start) {
std::copy(output_ptr + start, std::copy(output_ptr + start, output_ptr + end,
output_ptr + end,
data.begin() + (j - 1) * max_dim * common_dim); data.begin() + (j - 1) * max_dim * common_dim);
} }
} }
...@@ -257,15 +253,13 @@ bool NativePaddlePredictor::GetFetch( ...@@ -257,15 +253,13 @@ bool NativePaddlePredictor::GetFetch(
} }
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>( NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor"; VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, config.fraction_of_gpu_memory, 0.f,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags; std::vector<std::string> flags;
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
namespace paddle { namespace paddle {
...@@ -77,8 +77,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -77,8 +77,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables"; VLOG(5) << "to create variables";
executor_->CreateVariables( executor_->CreateVariables(*inference_program_,
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names // Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames(); feed_target_names_ = inference_program_->GetFeedTargetNames();
...@@ -98,8 +98,7 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>( ...@@ -98,8 +98,7 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory, config.fraction_of_gpu_memory, 0.f,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags; std::vector<std::string> flags;
......
#!/bin/bash
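# Sanity-checks an inference library: it must export paddle symbols and
# must not leak google (glog/gflags) symbols.
# Usage (names illustrative): ./check_symbols.sh libpaddle_fluid.so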
lib=$1
if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
exit 0
cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
endif()
if(NOT DEFINED DEMO_NAME)
message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
endif()
option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
if(WITH_GPU)
set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
endif()
include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif()
else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
endif()
# Note: libpaddle_inference_api.so/a must come before libpaddle_fluid.so/a
if(WITH_STATIC_LIB)
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
else()
set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z
${EXTERNAL_LIB})
if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
endif()
target_link_libraries(${DEMO_NAME} ${DEPS})
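A typical configure step for this CMakeLists, matching what `run.sh` below does (paths illustrative), is `cmake .. -DPADDLE_LIB=/path/to/fluid_install_dir -DWITH_MKL=ON -DDEMO_NAME=vis_demo -DWITH_GPU=OFF -DWITH_STATIC_LIB=ON` followed by `make -j`.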
# Inference Demos
There are several demos:
- simple_on_word2vec:
- The C++ code is in `simple_on_word2vec.cc`.
- It is suitable for the word2vec model.
- vis_demo:
- The C++ code is in `vis_demo.cc`.
- It is suitable for three models: mobilenet, se_resnext50, and ocr.
- Input data format:
- Each line contains a single record
- Each record's format is
```
<space separated floats as data>\t<space separated ints as shape>
```
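- For illustration, a hypothetical record carrying six floats with shape `2x3` would be (`\t` stands for a literal tab):
```
0.1 0.2 0.3 0.4 0.5 0.6\t2 3
```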
To build and execute the demos, simply run
```
./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU
```
- This builds and runs every demo twice, once against the static library and once against the shared library.
- `$PADDLE_ROOT`: paddle library path
- `$TURN_ON_MKL`: use MKL (ON) or OpenBLAS (OFF)
- `$TEST_GPU_CPU`: test both GPU and CPU modes (ON) or CPU mode only (OFF)
- NOTE: for simple_on_word2vec, run `ctest -R test_word2vec` first to generate the word2vec inference model.
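- For example, `./run.sh $HOME/Paddle ON ON` (path illustrative) builds the demos and tests them in both GPU and CPU mode.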
set -x
PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas
TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
if [ "$2" == "ON" ]; then
# Export this yourself if you have moved the install path.
MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
fi
if [ "$3" == "ON" ]; then
use_gpu_list='true false'
else
use_gpu_list='false'
fi
# download vis_demo data
function download() {
dir_name=$1
mkdir -p $dir_name
cd $dir_name
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
cd ..
}
URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
mkdir -p data
cd data
vis_demo_list='se_resnext50 ocr mobilenet'
for vis_demo_name in $vis_demo_list; do
download $vis_demo_name
done
cd ..
# compile and test the demo
mkdir -p build
cd build
for WITH_STATIC_LIB in ON OFF; do
# -----simple_on_word2vec-----
rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j
word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
if [ -d $word2vec_model ]; then
for use_gpu in $use_gpu_list; do
./simple_on_word2vec \
--dirname=$word2vec_model \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
done
fi
# ---------vis_demo---------
rm -rf *
cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB
make -j
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
./vis_demo \
--modeldir=../data/$vis_demo_name/model \
--data=../data/$vis_demo_name/data.txt \
--refer=../data/$vis_demo_name/result.txt \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
fi
done
done
done
set +x
...@@ -16,21 +16,27 @@ limitations under the License. */ ...@@ -16,21 +16,27 @@ limitations under the License. */
* This file contains a simple demo for how to take a model for inference. * This file contains a simple demo for how to take a model for inference.
*/ */
#include <cmath>
#include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory> #include <memory>
#include <thread> #include <thread> //NOLINT
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle { namespace paddle {
namespace demo { namespace demo {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) { void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config. //# 1. Create PaddlePredictor with a config.
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; if (FLAGS_dirname.empty()) {
LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model";
exit(1);
}
config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.device = 0; config.device = 0;
...@@ -54,12 +60,16 @@ void Main(bool use_gpu) { ...@@ -54,12 +60,16 @@ void Main(bool use_gpu) {
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
//# 4. Get output. //# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length(); // Check the output buffer size and result of each tid.
PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i]; PADDLE_ENFORCE_LT(
std::fabs(static_cast<float*>(outputs.front().data.data())[i] - result[i]),
1e-5);
} }
} }
} }
...@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -68,7 +78,7 @@ void MainThreads(int num_threads, bool use_gpu) {
// Multi-threading is only supported on CPU // Multi-threading is only supported on CPU
// 0. Create PaddlePredictor with a config. // 0. Create PaddlePredictor with a config.
NativeConfig config; NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname;
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.device = 0; config.device = 0;
...@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -94,14 +104,17 @@ void MainThreads(int num_threads, bool use_gpu) {
CHECK(predictor->Run(inputs, &outputs)); CHECK(predictor->Run(inputs, &outputs));
// 4. Get output. // 4. Get output.
ASSERT_EQ(outputs.size(), 1UL); PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
LOG(INFO) << "TID: " << tid << ", " // Check the output buffer size and result of each tid.
<< "output buffer size: " << outputs.front().data.length(); PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = const size_t num_elements =
outputs.front().data.length() / sizeof(float); outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i]; PADDLE_ENFORCE_LT(
std::fabs(static_cast<float*>(outputs.front().data.data())[i] - result[i]),
1e-5);
} }
} }
}); });
...@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -111,15 +124,18 @@ void MainThreads(int num_threads, bool use_gpu) {
} }
} }
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
#endif
} // namespace demo } // namespace demo
} // namespace paddle } // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main(false /* use_gpu*/);
paddle::demo::MainThreads(1, false /* use_gpu*/);
paddle::demo::MainThreads(4, false /* use_gpu*/);
if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/);
paddle::demo::MainThreads(1, true /*use_gpu*/);
paddle::demo::MainThreads(4, true /*use_gpu*/);
}
return 0;
}
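With gflags parsing in place, the demo now runs standalone instead of under gtest; `run.sh` above invokes it as `./simple_on_word2vec --dirname=$word2vec_model --use_gpu=$use_gpu`.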
...@@ -13,16 +13,15 @@ ...@@ -13,16 +13,15 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle { namespace paddle {
namespace demo { namespace demo {
static void split(const std::string& str, static void split(const std::string& str, char sep,
char sep,
std::vector<std::string>* pieces) { std::vector<std::string>* pieces) {
pieces->clear(); pieces->clear();
if (str.empty()) { if (str.empty()) {
......
...@@ -18,26 +18,24 @@ limitations under the License. */ ...@@ -18,26 +18,24 @@ limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include "paddle/contrib/inference/demo/utils.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/contrib/inference/paddle_inference_api.h" #include "utils.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
#endif #endif
namespace paddle {
namespace demo {
DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string(refer, "", "path to reference result for comparison.");
DEFINE_string( DEFINE_string(
data, data, "",
"",
"path of data; each line is a record, format is " "path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'"); "'<space splitted floats as data>\t<space splitted ints as shape'");
DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace paddle {
namespace demo {
struct Record { struct Record {
std::vector<float> data; std::vector<float> data;
...@@ -47,7 +45,7 @@ struct Record { ...@@ -47,7 +45,7 @@ struct Record {
void split(const std::string& str, char sep, std::vector<std::string>* pieces); void split(const std::string& str, char sep, std::vector<std::string>* pieces);
Record ProcessALine(const std::string& line) { Record ProcessALine(const std::string& line) {
LOG(INFO) << "process a line"; VLOG(3) << "process a line";
std::vector<std::string> columns; std::vector<std::string> columns;
split(line, '\t', &columns); split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL) CHECK_EQ(columns.size(), 2UL)
...@@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) { ...@@ -65,8 +63,8 @@ Record ProcessALine(const std::string& line) {
for (auto& s : shape_strs) { for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s)); record.shape.push_back(std::stoi(s));
} }
LOG(INFO) << "data size " << record.data.size(); VLOG(3) << "data size " << record.data.size();
LOG(INFO) << "data shape size " << record.shape.size(); VLOG(3) << "data shape size " << record.shape.size();
return record; return record;
} }
...@@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { ...@@ -78,20 +76,22 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
file.close(); file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
LOG(INFO) << "predictor output numel " << numel; VLOG(3) << "predictor output numel " << numel;
LOG(INFO) << "reference output numel " << refer.data.size(); VLOG(3) << "reference output numel " << refer.data.size();
EXPECT_EQ(numel, refer.data.size()); PADDLE_ENFORCE_EQ(numel, refer.data.size());
switch (output.dtype) { switch (output.dtype) {
case PaddleDType::INT64: { case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]); PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
refer.data[i]);
} }
break; break;
} }
case PaddleDType::FLOAT32: case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
EXPECT_NEAR( PADDLE_ENFORCE_LT(
static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5); fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5);
} }
break; break;
} }
...@@ -106,15 +106,15 @@ void Main(bool use_gpu) { ...@@ -106,15 +106,15 @@ void Main(bool use_gpu) {
config.prog_file = FLAGS_modeldir + "/__model__"; config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = use_gpu; config.use_gpu = use_gpu;
config.device = 0; config.device = 0;
#ifdef PADDLE_WITH_CUDA if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; config.fraction_of_gpu_memory = 0.1; // adjust as needed
#endif }
LOG(INFO) << "init predictor"; VLOG(3) << "init predictor";
auto predictor = auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
LOG(INFO) << "begin to process data"; VLOG(3) << "begin to process data";
// Just a single batch of data. // Just a single batch of data.
std::string line; std::string line;
std::ifstream file(FLAGS_data); std::ifstream file(FLAGS_data);
...@@ -129,21 +129,26 @@ void Main(bool use_gpu) { ...@@ -129,21 +129,26 @@ void Main(bool use_gpu) {
.data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
.dtype = PaddleDType::FLOAT32}; .dtype = PaddleDType::FLOAT32};
LOG(INFO) << "run executor"; VLOG(3) << "run executor";
std::vector<PaddleTensor> output; std::vector<PaddleTensor> output;
predictor->Run({input}, &output); predictor->Run({input}, &output);
LOG(INFO) << "output.size " << output.size(); VLOG(3) << "output.size " << output.size();
auto& tensor = output.front(); auto& tensor = output.front();
LOG(INFO) << "output: " << SummaryTensor(tensor); VLOG(3) << "output: " << SummaryTensor(tensor);
// compare with reference result // compare with reference result
CheckOutput(FLAGS_refer, tensor); CheckOutput(FLAGS_refer, tensor);
} }
TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
#endif
} // namespace demo } // namespace demo
} // namespace paddle } // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
paddle::demo::Main(false /* use_gpu*/);
if (FLAGS_use_gpu) {
paddle::demo::Main(true /*use_gpu*/);
}
return 0;
}
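Likewise, `vis_demo` is now a standalone binary; `run.sh` above invokes it once per model, e.g. `./vis_demo --modeldir=../data/mobilenet/model --data=../data/mobilenet/data.txt --refer=../data/mobilenet/result.txt --use_gpu=false`.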
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/contrib/inference/paddle_inference_api.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle { namespace paddle {
......
...@@ -15,10 +15,10 @@ limitations under the License. */ ...@@ -15,10 +15,10 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> #include <thread> // NOLINT
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) { ...@@ -121,8 +121,8 @@ void MainImageClassification(bool use_gpu) {
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
feed_target_shapes[0][0] = batch_size; feed_target_shapes[0][0] = batch_size;
framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
SetupTensor<float>( SetupTensor<float>(&input, input_dims, static_cast<float>(0),
&input, input_dims, static_cast<float>(0), static_cast<float>(1)); static_cast<float>(1));
std::vector<framework::LoDTensor*> cpu_feeds; std::vector<framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle { namespace paddle {
......
...@@ -259,12 +259,15 @@ op_library(max_sequence_len_op DEPS lod_rank_table) ...@@ -259,12 +259,15 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor) op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
...@@ -35,7 +35,14 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,14 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(inference_height, label_height, PADDLE_ENFORCE_EQ(inference_height, label_height,
"Out and Label should have same height."); "Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds");
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres});
ctx->SetOutputDim("TNOut", {num_thres});
ctx->SetOutputDim("FPOut", {num_thres});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Out", /*->*/ "AUC"); ctx->ShareLoD("Out", /*->*/ "AUC");
} }
...@@ -63,10 +70,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -63,10 +70,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data." "A 2D int tensor indicating the label of the training data."
"The height is batch size and width is always 1."); "The height is batch size and width is always 1.");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value.");
AddOutput("FPOut", "False-Positive value.");
AddOutput("TNOut", "True-Negative value.");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
......
...@@ -34,6 +34,12 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -34,6 +34,12 @@ class AucKernel : public framework::OpKernel<T> {
auto* inference = ctx.Input<Tensor>("Out"); auto* inference = ctx.Input<Tensor>("Out");
auto* label = ctx.Input<Tensor>("Label"); auto* label = ctx.Input<Tensor>("Label");
auto* auc = ctx.Output<Tensor>("AUC"); auto* auc = ctx.Output<Tensor>("AUC");
// Only use output var for now, make sure it's persistable and
// not cleaned up for each batch.
auto* true_positive = ctx.Output<Tensor>("TPOut");
auto* false_positive = ctx.Output<Tensor>("FPOut");
auto* true_negative = ctx.Output<Tensor>("TNOut");
auto* false_negative = ctx.Output<Tensor>("FNOut");
float* auc_data = auc->mutable_data<float>(ctx.GetPlace()); float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
...@@ -54,19 +60,10 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -54,19 +60,10 @@ class AucKernel : public framework::OpKernel<T> {
const T* inference_data = inference->data<T>(); const T* inference_data = inference->data<T>();
const int64_t* label_data = label->data<int64_t>(); const int64_t* label_data = label->data<int64_t>();
// Create local tensor for storing the curve: TP, FN, TN, FP auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
// TODO(typhoonzero): use eigen op to caculate these values. auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
Tensor true_positive, false_positive, true_negative, false_negative; auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
true_positive.Resize({num_thresholds});
false_negative.Resize({num_thresholds});
true_negative.Resize({num_thresholds});
false_positive.Resize({num_thresholds});
int64_t* tp_data = true_positive.mutable_data<int64_t>(ctx.GetPlace());
int64_t* fn_data = false_negative.mutable_data<int64_t>(ctx.GetPlace());
int64_t* tn_data = true_negative.mutable_data<int64_t>(ctx.GetPlace());
int64_t* fp_data = false_positive.mutable_data<int64_t>(ctx.GetPlace());
for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) { for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
// calculate TP, FN, TN, FP for current thresh // calculate TP, FN, TN, FP for current thresh
...@@ -91,10 +88,10 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -91,10 +88,10 @@ class AucKernel : public framework::OpKernel<T> {
} }
} }
// store rates // store rates
tp_data[idx_thresh] = tp; tp_data[idx_thresh] += tp;
fn_data[idx_thresh] = fn; fn_data[idx_thresh] += fn;
tn_data[idx_thresh] = tn; tn_data[idx_thresh] += tn;
fp_data[idx_thresh] = fp; fp_data[idx_thresh] += fp;
} }
// epsilon to avoid divide by zero. // epsilon to avoid divide by zero.
float epsilon = 1e-6; float epsilon = 1e-6;
......
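Since the counters now accumulate across batches (`+=` rather than `=`), the reported AUC becomes a running metric. The integration step that follows these counters reduces to a trapezoid rule over per-threshold rates; a self-contained sketch of that calculation (not this kernel's literal code; thresholds assumed ordered so that FPR increases):
```
#include <cstdint>
#include <vector>

// Hedged sketch: ROC-AUC from accumulated per-threshold counters.
float RocAuc(const std::vector<int64_t>& tp, const std::vector<int64_t>& fn,
             const std::vector<int64_t>& tn, const std::vector<int64_t>& fp) {
  const float eps = 1e-6f;  // avoid divide-by-zero, as in the kernel
  float auc = 0.f, prev_tpr = 0.f, prev_fpr = 0.f;
  for (size_t i = 0; i < tp.size(); ++i) {
    float tpr = tp[i] / (tp[i] + fn[i] + eps);  // true-positive rate
    float fpr = fp[i] / (fp[i] + tn[i] + eps);  // false-positive rate
    auc += 0.5f * (fpr - prev_fpr) * (tpr + prev_tpr);  // trapezoid slice
    prev_tpr = tpr;
    prev_fpr = fpr;
  }
  return auc;
}
```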
...@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase { ...@@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase {
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
<< " and dir:" << dir << " to " << epmap[i]; << " and dir:" << dir << " to " << epmap[i];
} }
rpc_client->Wait(); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
} }
}; };
......
...@@ -18,9 +18,6 @@ ...@@ -18,9 +18,6 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using conv_bwd_data = mkldnn::convolution_backward_data;
using conv_bwd_weights = mkldnn::convolution_backward_weights;
using conv_fwd = mkldnn::convolution_forward;
using framework::DataLayout; using framework::DataLayout;
using mkldnn::memory; using mkldnn::memory;
using mkldnn::primitive; using mkldnn::primitive;
...@@ -29,6 +26,196 @@ using mkldnn::stream; ...@@ -29,6 +26,196 @@ using mkldnn::stream;
using platform::to_void_cast; using platform::to_void_cast;
using platform::GetMKLDNNFormat; using platform::GetMKLDNNFormat;
class ConvMKLDNNHandler : public platform::MKLDNNHandler {
public:
ConvMKLDNNHandler(
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
conv_pd_ = conv_pd;
}
ConvMKLDNNHandler(
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
conv_bwd_data_pd,
std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
conv_bwd_weights_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
conv_pd_(conv_pd),
conv_bwd_weights_pd_(conv_bwd_weights_pd),
conv_bwd_data_pd_(conv_bwd_data_pd) {
// If we are in a Grad operator then update the key with a BWD suffix to
// distinguish it from FWD memory primitives
key_ += "-BWD";
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p,
"@weights-src_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@weights-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
"@diff_weights_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@data-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
auto user_pd = user_weights_memory_p->get_primitive_desc();
return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
"@data-weights_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
"@dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto src_pd = conv_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<mkldnn::convolution_forward>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
std::shared_ptr<mkldnn::convolution_backward_weights>
AcquireConvolutionBackwardWeights(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
auto prim_key = key_ + "@conv_bwd_weights_p";
auto conv_bwd_weights_p =
std::static_pointer_cast<mkldnn::convolution_backward_weights>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd weights primitive in device context");
if (conv_bwd_weights_p == nullptr) {
// create backward conv primitive for weights
conv_bwd_weights_p =
std::make_shared<mkldnn::convolution_backward_weights>(
*conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
*diff_weights_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
} else {
is_reusing_ = true;
}
return conv_bwd_weights_p;
}
std::shared_ptr<mkldnn::convolution_backward_data>
AcquireConvolutionBackwardData(
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
auto prim_key = key_ + "@conv_bwd_data_p";
auto conv_bwd_data_p =
std::static_pointer_cast<mkldnn::convolution_backward_data>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_data_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd data primitive in device context");
if (conv_bwd_data_p == nullptr) {
conv_bwd_data_p = std::make_shared<mkldnn::convolution_backward_data>(
*conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
*diff_src_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
} else {
is_reusing_ = true;
}
return conv_bwd_data_p;
}
// Generate keys for storing/retrieving primitives for this operator
// TODO(jczaja): Make hashing function more optimal
static std::string GetHash(memory::dims& input_dims,
memory::dims& weights_dims,
std::vector<int>& strides,
std::vector<int>& paddings,
std::vector<int>& dilations, int groups,
const std::string& suffix) {
return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
suffix;
}
private:
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
conv_bwd_weights_pd_;
std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
conv_bwd_data_pd_;
};
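Every `Acquire*` method above follows the same look-up-or-create pattern against the device context, keyed by the string from `GetHash()` plus a per-primitive suffix. Distilled to its essence (a sketch only; `BlobMap` stands in for the real MKLDNNDeviceContext API):
```
#include <map>
#include <memory>
#include <string>
#include <utility>

// Hedged sketch of the acquire-or-create caching pattern used by the
// handler: reuse the primitive cached under `key`, else build and cache it.
using BlobMap = std::map<std::string, std::shared_ptr<void>>;

template <typename Prim, typename... Args>
std::shared_ptr<Prim> AcquireOrCreate(BlobMap* ctx, const std::string& key,
                                      Args&&... args) {
  auto it = ctx->find(key);
  if (it != ctx->end()) {  // hit: reuse work from an earlier iteration
    return std::static_pointer_cast<Prim>(it->second);
  }
  auto prim = std::make_shared<Prim>(std::forward<Args>(args)...);
  (*ctx)[key] = prim;  // miss: build once and cache for later iterations
  return prim;
}
```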
template <typename T> template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -36,10 +223,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -36,10 +223,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
// Get unique name for index
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>(); ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
...@@ -80,86 +263,84 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -80,86 +263,84 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// create mkldnn memory from input tensors (data/weights) // Get unique name for storing MKLDNN primitives
auto user_src_memory = memory( const std::string key = ConvMKLDNNHandler::GetHash(
{{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, src_tz, weights_tz, strides, paddings, dilations, groups,
to_void_cast(input_data)); ctx.op().Output("Output"));
auto user_weights_memory = const std::string key_conv_pd = key + "@conv_pd";
memory({{{weights_tz}, memory::data_type::f32, filter->format()},
mkldnn_engine}, std::vector<primitive> pipeline;
to_void_cast(filter_data));
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
/* create memory descriptor for convolution without specified format /* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose * ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance * the memory format preferred for best performance
*/ */
auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, auto src_md = platform::MKLDNNMemDesc(
memory::format::any); src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::f32, memory::format::any); weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, auto dst_md = platform::MKLDNNMemDesc(
memory::format::any); dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc( std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine);
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
// create reorder primitive if the input format is not the preferred one ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
auto src_memory = user_src_memory;
primitive reorder_src;
bool is_src_reordered = false;
if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
user_src_memory.get_primitive_desc()) {
src_memory = memory(conv_pd->src_primitive_desc());
reorder_src = reorder(user_src_memory, src_memory);
is_src_reordered = true;
}
auto weights_memory = user_weights_memory;
primitive reorder_weights;
bool is_weights_reordered = false;
if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
user_weights_memory.get_primitive_desc()) {
weights_memory = memory(conv_pd->weights_primitive_desc());
reorder_weights = reorder(user_weights_memory, weights_memory);
is_weights_reordered = true;
}
// create memory primitive for conv dst // create mkldnn memory from input tensors (data/weights)
auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); auto user_src_memory_p =
handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, to_void_cast<T>(filter_data));
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline);
auto dst_memory_p =
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
// create convolution op primitive // create convolution op primitive
auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
std::vector<primitive> pipeline; pipeline.push_back(*conv_p);
if (is_src_reordered) pipeline.push_back(reorder_src);
if (is_weights_reordered) pipeline.push_back(reorder_weights);
pipeline.push_back(conv_prim);
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
output->set_layout(DataLayout::kMKLDNN); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(dst_memory)); output->set_format(GetMKLDNNFormat(*dst_memory_p));
} }
private: private:
std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc( std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) const { const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = auto conv_desc = mkldnn::convolution_forward::desc(
conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
src, weights, dst, stride_dims, padding_dims, dst, stride_dims, padding_dims, padding_dims,
padding_dims, mkldnn::padding_kind::zero); mkldnn::padding_kind::zero);
auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd); return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
} }
}; };
...@@ -197,13 +378,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -197,13 +378,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
if (!input_grad && !filter_grad) return; if (!input_grad && !filter_grad) return;
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>(); const T* filter_data = filter->data<T>();
...@@ -223,146 +401,116 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -223,146 +401,116 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// create mkldnn memory from input tensors (input/weights/output_grad) // Get an unique name from "argument" name of "Output" variable
auto user_src_memory = memory( // as well as attributes of primitive to be created
{{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, // This name will be used as key when saving info into device context
to_void_cast(input_data)); const std::string key =
auto user_weights_memory = ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
memory({{{weights_tz}, memory::data_type::f32, filter->format()}, dilations, groups, ctx.op().Input("Output"));
mkldnn_engine},
to_void_cast(filter_data)); const std::string key_conv_pd = key + "@conv_pd";
auto user_diff_dst_memory = std::vector<primitive> pipeline;
memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
mkldnn_engine}, // Create user memory descriptors
to_void_cast(output_grad_data)); auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
auto user_diff_dst_md = platform::MKLDNNMemDesc(
{dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
/* create memory descriptor for conv backward without specified format /* create memory descriptor for conv backward without specified format
* ('any') which lets a primitive (conv backward in this case) choose * ('any') which lets a primitive (conv backward in this case) choose
* the memory format preferred for best performance * the memory format preferred for best performance
*/ */
auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, auto src_md = platform::MKLDNNMemDesc(
memory::format::any); src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, auto diff_src_md = platform::MKLDNNMemDesc(
memory::format::any); src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
    auto diff_weights_md = platform::MKLDNNMemDesc(
        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
    auto diff_dst_md = platform::MKLDNNMemDesc(
        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);

    // Retrieve conv_pd from device context
    auto conv_pd =
        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
            dev_ctx.GetBlob(key_conv_pd));
    PADDLE_ENFORCE(conv_pd != nullptr,
                   "Fail to find conv_pd in device context");

    // create backward convolution weights primitive descriptor
    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
        mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
        strides, paddings, paddings, mkldnn::padding_kind::zero);
    auto conv_bwd_weights_pd =
        std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
            conv_bwd_weights_desc, mkldnn_engine, *conv_pd);

    // create backward convolution data primitive descriptor
    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
        mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
        strides, paddings, paddings, mkldnn::padding_kind::zero);
    auto conv_bwd_data_pd =
        std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
            conv_bwd_data_desc, mkldnn_engine, *conv_pd);

    ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd,
                              dev_ctx, mkldnn_engine, key);

    // create mkldnn memory from input tensors (data/weights)
    auto user_src_memory_p =
        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
    auto user_weights_memory_p = handler.AcquireWeightsMemory(
        user_weights_md, to_void_cast<T>(filter_data));
    auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory(
        user_diff_dst_md, to_void_cast<T>(output_grad_data));

    // create backward conv primitive for weights
    if (filter_grad) {
      auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive(
          user_src_memory_p, pipeline);
      auto diff_dst_memory_4filter_p =
          handler.AcquireDiffDstMemoryFromWeightsPrimitive(
              user_diff_dst_memory_p, pipeline);

      auto diff_weights_memory_p =
          handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
              reinterpret_cast<void*>(filter_grad_data));

      auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(
          src_memory_p, diff_dst_memory_4filter_p, diff_weights_memory_p);

      // push primitive to stream and wait until it's executed
      pipeline.push_back(*conv_bwd_weights_p);

      filter_grad->set_layout(DataLayout::kMKLDNN);
      filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p));
    }

    if (input_grad) {
      auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(
          user_weights_memory_p, pipeline);

      auto diff_dst_memory_4data_p =
          handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
                                                        pipeline);

      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
          reinterpret_cast<void*>(input_grad_data));

      auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData(
          diff_dst_memory_4data_p, weights_memory_p, diff_src_memory_p);

      pipeline.push_back(*conv_bwd_data_p);

      input_grad->set_layout(DataLayout::kMKLDNN);
      input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
    }
    stream(stream::kind::eager).submit(pipeline).wait();
  }  // Compute()
};
...
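The rewritten kernel above replaces each hand-written "compare primitive descriptors, reorder if they differ" block with a handler call. A minimal standalone sketch of the pattern those Acquire* methods presumably implement, with illustrative names (this is not the actual ConvMKLDNNHandler API):

    // Reuse the user's memory when its format already matches what the
    // primitive expects; otherwise allocate matching memory and queue a
    // reorder into the pipeline submitted once at the end of Compute().
    mkldnn::memory AcquireMemorySketch(
        const mkldnn::memory::primitive_desc& expected_pd,
        const mkldnn::memory& user_memory,
        std::vector<mkldnn::primitive>* pipeline) {
      if (user_memory.get_primitive_desc() == expected_pd) {
        return user_memory;  // no reorder needed
      }
      mkldnn::memory target(expected_pd);
      pipeline->push_back(mkldnn::reorder(user_memory, target));
      return target;
    }

Note that the refactor also merges the two per-branch stream submissions of the old code into a single submit-and-wait after both branches.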
...@@ -149,6 +149,13 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
        "(float) "
        "Prior boxes center offset.")
        .SetDefault(0.5);
    AddAttr<bool>(
        "min_max_aspect_ratios_order",
        "(bool) If set True, the output prior boxes are in the order of "
        "[min, max, aspect_ratios], which is consistent with Caffe. "
        "Please note that this order affects the weight order of the "
        "convolution layer that follows, and does not affect the final "
        "detection results.")
        .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Prior box operator Prior box operator
Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
......
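The new attribute only permutes the per-location box order; the box sizes themselves are unchanged. A standalone sketch of the two orders (example values and names, not operator code):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Print the half-sizes of one location's priors in the order the kernel
    // emits them, for both settings of min_max_aspect_ratios_order.
    void EnumeratePriorsSketch(float min_size, float max_size,
                               const std::vector<float>& aspect_ratios,
                               bool min_max_aspect_ratios_order) {
      if (min_max_aspect_ratios_order) {
        // Caffe order: min box, max box, then the remaining aspect ratios.
        std::printf("min: %g\n", min_size / 2);
        std::printf("max: %g\n", std::sqrt(min_size * max_size) / 2);
        for (float ar : aspect_ratios) {
          if (std::fabs(ar - 1.f) < 1e-6) continue;  // ar == 1 is the min box
          std::printf("ar %g: %g x %g\n", ar, min_size * std::sqrt(ar) / 2,
                      min_size / std::sqrt(ar) / 2);
        }
      } else {
        // Default order: all aspect-ratio boxes first, then the max box.
        for (float ar : aspect_ratios) {
          std::printf("ar %g: %g x %g\n", ar, min_size * std::sqrt(ar) / 2,
                      min_size / std::sqrt(ar) / 2);
        }
        std::printf("max: %g\n", std::sqrt(min_size * max_size) / 2);
      }
    }

For example, EnumeratePriorsSketch(30.f, 60.f, {1.f, 2.f, 0.5f}, true) lists min, max, ar 2, ar 0.5, matching the [min, max, aspect_ratios] order the attribute documents.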
...@@ -28,8 +28,8 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                            const int im_width, const int as_num,
                            const T offset, const T step_width,
                            const T step_height, const T* min_sizes,
                            const T* max_sizes, const int min_num, bool is_clip,
                            bool min_max_aspect_ratios_order) {
  int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
  int box_num = height * width * num_priors;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
...@@ -44,6 +44,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
    T min_size = min_sizes[m];
    if (max_sizes) {
      int s = p % (as_num + 1);
      if (!min_max_aspect_ratios_order) {
        if (s < as_num) {
          T ar = aspect_ratios[s];
          bw = min_size * sqrt(ar) / 2.;
...@@ -53,6 +54,19 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
          bw = sqrt(min_size * max_size) / 2.;
          bh = bw;
        }
      } else {
        if (s == 0) {
          bw = bh = min_size / 2.;
        } else if (s == 1) {
          T max_size = max_sizes[m];
          bw = sqrt(min_size * max_size) / 2.;
          bh = bw;
        } else {
          T ar = aspect_ratios[s - 1];
          bw = min_size * sqrt(ar) / 2.;
          bh = min_size / sqrt(ar) / 2.;
        }
      }
    } else {
      int s = p % as_num;
      T ar = aspect_ratios[s];
...@@ -94,6 +108,8 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto flip = ctx.Attr<bool>("flip");
    auto clip = ctx.Attr<bool>("clip");
    auto min_max_aspect_ratios_order =
        ctx.Attr<bool>("min_max_aspect_ratios_order");
    std::vector<float> aspect_ratios;
    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
...@@ -149,7 +165,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
    GenPriorBox<T><<<grid, block, 0, stream>>>(
        boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
        aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
        max_data, min_num, clip, min_max_aspect_ratios_order);
    framework::Tensor v;
    framework::TensorFromVector(variances, ctx.device_context(), &v);
...
...@@ -68,6 +68,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto flip = ctx.Attr<bool>("flip");
    auto clip = ctx.Attr<bool>("clip");
    auto min_max_aspect_ratios_order =
        ctx.Attr<bool>("min_max_aspect_ratios_order");
    std::vector<float> aspect_ratios;
    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
...@@ -108,6 +110,38 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
        int idx = 0;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          auto min_size = min_sizes[s];
          if (min_max_aspect_ratios_order) {
            box_width = box_height = min_size / 2.;
            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
            idx++;
            if (max_sizes.size() > 0) {
              auto max_size = max_sizes[s];
              // square prior with size sqrt(minSize * maxSize)
              box_width = box_height = sqrt(min_size * max_size) / 2.;
              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
              idx++;
            }
            // priors with different aspect ratios
            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
              float ar = aspect_ratios[r];
              if (fabs(ar - 1.) < 1e-6) {
                continue;
              }
              box_width = min_size * sqrt(ar) / 2.;
              box_height = min_size / sqrt(ar) / 2.;
              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
              idx++;
            }
          } else {
            // priors with different aspect ratios
            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
              float ar = aspect_ratios[r];
...@@ -132,6 +166,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
            }
          }
        }
      }
    if (clip) {
      platform::Transform<platform::CPUDeviceContext> trans;
...
...@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                          std::minstd_rand engine,
                          std::vector<int>* inds) const {
    std::uniform_real_distribution<float> uniform(0, 1);
    const int64_t size = static_cast<int64_t>(inds->size());
    if (size > num) {
      for (int64_t i = num; i < size; ++i) {
        int rng_ind = std::floor(uniform(engine) * i);
        if (rng_ind < num)
          std::iter_swap(inds->begin() + rng_ind + offset,
...
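The loop rewritten above is a reservoir-sampling style selection: once it finishes, the first `num` slots hold a random subset of all indices. A self-contained sketch of the same step, without the `offset` bookkeeping of the real kernel:

    #include <algorithm>
    #include <cmath>
    #include <random>
    #include <vector>

    // Keep a random subset of size num in the first num slots of inds.
    void ReservoirSampleSketch(std::vector<int>* inds, int64_t num,
                               std::minstd_rand* engine) {
      std::uniform_real_distribution<float> uniform(0, 1);
      const int64_t size = static_cast<int64_t>(inds->size());
      if (size <= num) return;  // nothing to subsample
      for (int64_t i = num; i < size; ++i) {
        // pick a slot in [0, i); element i survives only if a kept slot is drawn
        int64_t rng_ind = static_cast<int64_t>(std::floor(uniform(*engine) * i));
        if (rng_ind < num) {
          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
        }
      }
    }

The int64_t change matters because `inds->size()` is unsigned; comparing it against a signed `num`, or looping with a plain `int`, risks sign-compare warnings and wraparound on large inputs.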
...@@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
  req_count_++;
}

bool GRPCClient::Wait() {
  std::unique_lock<std::mutex> lk(sync_mutex_);
  sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); });
  return ok_;
}

void GRPCClient::Proceed() {
...@@ -297,6 +298,14 @@ void GRPCClient::Proceed() {
    if (c->status_.ok()) {
      VLOG(3) << c->var_h_.String() << " process";
      c->Process();
    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
      LOG(ERROR) << c->var_h_.String()
                 << " meets grpc error:" << c->status_.error_message();
      {
        std::lock_guard<std::mutex> lk(sync_mutex_);
        ok_ = false;
      }
      sync_cond_.notify_all();
    } else {
      LOG(FATAL) << c->var_h_.String()
                 << " meets grpc error:" << c->status_.error_message();
...
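Taken together, these changes give Wait() a "block until all pending RPCs finish, or until one of them marks the client bad" contract. A condensed standalone sketch of that synchronization pattern (illustrative names, not the GRPCClient API):

    #include <condition_variable>
    #include <mutex>

    class PendingCounterSketch {
     public:
      // Returns true if every tracked request completed successfully.
      bool Wait() {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return count_ == 0 || !ok_; });
        return ok_;
      }
      void Add() {
        std::lock_guard<std::mutex> lk(mu_);
        ++count_;
      }
      // Called from the completion thread for each finished request.
      void Done(bool request_ok) {
        {
          std::lock_guard<std::mutex> lk(mu_);
          if (!request_ok) ok_ = false;
          --count_;
        }
        cv_.notify_all();
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      int count_ = 0;
      bool ok_ = true;
    };

Flipping ok_ under sync_mutex_ before notifying, as the DEADLINE_EXCEEDED branch does, is what keeps the predicate check inside Wait() race-free.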
...@@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class GRPCClient : public RPCClient {
 public:
  GRPCClient() : ok_(true) {}
  virtual ~GRPCClient();

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
...@@ -221,7 +221,7 @@ class GRPCClient : public RPCClient {
  void AsyncSendEndPass(const std::string& ep,
                        int64_t time_out = FLAGS_rpc_deadline) override;

  bool Wait() override;

  void SendBeginPass() override;
...@@ -247,6 +247,7 @@ class GRPCClient : public RPCClient {
  std::mutex sync_mutex_;
  std::condition_variable sync_cond_;
  std::atomic<int64_t> req_count_{0};
  bool ok_;

  // mutex for GetChannel thread safety
  std::mutex chan_mutex_;
...
...@@ -72,7 +72,7 @@ class RPCClient {
  virtual void SendBeginPass() = 0;
  virtual void SendEndPass() = 0;

  virtual bool Wait() = 0;

  template <typename T>
  static RPCClient* GetInstance() {
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_quantize_op.h"
#include <string>
namespace paddle {
namespace operators {
class FakeQuantizeOp : public framework::OperatorWithKernel {
public:
FakeQuantizeOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
"OutMovingScale(Out) of FakeQuantizeOp should not be null");
// if (ctx->HasInput("InMovingScale")) {
ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
//}
// if (ctx->HasInput("InScales")) {
PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
"OutScales(Out) of FakeQuantizeOp should not be null");
ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
// PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
// ctx->Outputs("OutScales")[0],
// "Mean and MeanOut should share the same memory");
//}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input tensor of scale operator.");
AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
.AsDispensable();
AddInput("InMovingScale", "Last scale, used in static quantization.")
.AsDispensable();
AddInput("InCurrentIter",
"Last iteration number, used in static quantization.")
.AsDispensable();
AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
AddOutput("OutScales",
"(Tensor) scale buffer, used in static quantization.")
.AsDispensable();
AddOutput("OutMovingScale", " Current scale");
AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
AddAttr<std::string>("quantize_type",
"(string, default abs_max)"
"The scaling tpe of the quantize operator.")
.SetDefault("abs_max");
AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC(
FakeQuantize operator
quantize_type = abs_max:
$$scale = max(abs(x))$$
quantize_type = range_abs_max:
$$scale = max(max(abs(x)), history_abs_max)$$
quantize_type = moving_average_abs_max:
$$scale = 0.1*scale+0.9*new_abs_max$$
$$Out = scale*X$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
fake_quantize,
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
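For the abs_max path registered above, the whole forward arithmetic fits in a short host-side sketch (an illustrative helper, not part of the operator):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // scale = max(abs(x)); out = round(clip(x, -scale, scale) * bin_cnt / scale)
    void FakeQuantizeAbsMaxSketch(const float* in, float* out, size_t n,
                                  int bit_length) {
      float scale = 1e-8f;  // guard against an all-zero tensor
      for (size_t i = 0; i < n; ++i) scale = std::max(scale, std::fabs(in[i]));
      const float bin_cnt =
          static_cast<float>((1 << (bit_length - 1)) - 1);  // 127 for 8 bits
      for (size_t i = 0; i < n; ++i) {
        float v = std::min(std::max(in[i], -scale), scale);  // saturate
        out[i] = std::round(v * bin_cnt / scale);  // map onto the integer grid
      }
    }

The range_abs_max and moving_average_abs_max paths differ only in how the scale is maintained across iterations: a windowed maximum over recent scales, or the running average 0.9 * new + 0.1 * old computed in the kernels below.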
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ T shared_max_data[];
if (gridDim.x > 1) {
shared_max_data[tid] = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = fabs(in[i]);
if (tmp > shared_max_data[tid]) {
shared_max_data[tid] = tmp;
}
}
} else {
if (bid < n) {
shared_max_data[tid] = fabs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
int length) {
float host_max;
  const int kNumThreads = 1024;
  int gridDimx = (kNumThreads - 1 + length) / kNumThreads;
  gridDimx = (gridDimx > kNumThreads) ? kNumThreads : gridDimx;
  framework::Tensor t;
  float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
                                            platform::CUDAPlace());
  // two-pass reduction: per-block maxima first, then reduce those maxima
  // with a single block
  FindAbsMaxKernel<float><<<gridDimx, kNumThreads, kNumThreads * sizeof(float),
                            ctx.stream()>>>(length, array, device_max);
  FindAbsMaxKernel<
      float><<<1, kNumThreads, kNumThreads * sizeof(float), ctx.stream()>>>(
      gridDimx, device_max, device_max);
PADDLE_ENFORCE_EQ(
cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
cudaSuccess, "cudaMemcpy failed");
return host_max;
}
template <typename T>
__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
int* num_saturate, const T min,
const T max) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ int shared_count[];
shared_count[tid] = 0;
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
if (in[i] > max) {
out[i] = max;
shared_count[tid] += 1;
} else if (in[i] < min) {
out[i] = min;
shared_count[tid] += 1;
} else {
out[i] = in[i];
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_count[tid] += shared_count[tid + i];
}
__syncthreads();
}
if (tid == 0) {
num_saturate[blockIdx.x] = shared_count[0];
}
}
template <typename T>
__global__ void ReduceKernel(const int n, const T* in, T* out) {
int tid = threadIdx.x;
extern __shared__ T shared_sum[];
if (tid < n) {
shared_sum[tid] = in[tid];
} else {
shared_sum[tid] = T(0);
}
__syncthreads();
// blockDim.x must >= n
for (int i = (n + 1) / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_sum[tid] += shared_sum[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[0] = shared_sum[0];
}
}
template <typename T>
int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
const T* in, T* out, const T min, const T max) {
int host_num_saturate;
  const int kNumThreads = 1024;
  int gridDimx = (n + kNumThreads - 1) / kNumThreads;
  gridDimx = (gridDimx > kNumThreads) ? kNumThreads : gridDimx;
  framework::Tensor t;
  int* device_num_saturate = t.mutable_data<int>(
      framework::make_ddim({gridDimx}), platform::CUDAPlace());
  ApplySaturateKernel<
      T><<<gridDimx, kNumThreads, kNumThreads * sizeof(T), ctx.stream()>>>(
      n, in, out, device_num_saturate, min, max);
  ReduceKernel<int><<<1, kNumThreads, kNumThreads * sizeof(T), ctx.stream()>>>(
      gridDimx, device_num_saturate, device_num_saturate);
gridDimx, device_num_saturate, device_num_saturate);
PADDLE_ENFORCE_EQ(cudaSuccess,
cudaMemcpy(&host_num_saturate, device_num_saturate,
sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy failed");
return host_num_saturate;
}
template <typename DeviceContext, typename T>
class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
public:
T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
framework::Tensor* scale_list, framework::Tensor* out_scale,
const T& cur_scale, int window_size,
int current_iter) const {
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
}
return max_scale;
}
T FindMovingAverageAbsMax(framework::Tensor* in_scale,
framework::Tensor* out_scale,
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
return T(outs[0]);
}
virtual void Compute(const framework::ExecutionContext& context) const {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto& device_ctx = context.cuda_device_context();
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test");
tensor->mutable_data<T>(in->place());
context.Output<framework::Tensor>("OutMovingScale")
->mutable_data<T>(
context.Input<framework::Tensor>("InMovingScale")->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
context.Output<framework::Tensor>("OutScales")
->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
context.Output<framework::Tensor>("OutCurrentIter")
->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
}
T scale = T(1);
int window_size = context.Attr<int>("window_size");
T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
auto& device_ctx = context.template device_context<DeviceContext>();
auto* scale_list = context.Output<framework::Tensor>("OutScales");
math::SetConstant<DeviceContext, T> scalar;
scale_list->mutable_data<T>(context.GetPlace());
scalar(device_ctx, scale_list, static_cast<T>(0));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
iter->mutable_data<T>(context.GetPlace());
scalar(device_ctx, iter, static_cast<T>(0));
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
auto* it = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InCurrentIter"));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
int* last_iter = it->mutable_data<int>(platform::CPUPlace());
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
window_size, current_iter[0]);
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = FindMovingAverageAbsMax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
}
}
ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
tensor->mutable_data<T>(in->place()), -scale, scale);
scale = bin_cnt / scale;
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
eigen_out.device(dev) = (scale * eigen_in).round();
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(fake_quantize,
paddle::operators::FakeQuantizeCUDAKernel<
paddle::platform::CUDADeviceContext, float>,
paddle::operators::FakeQuantizeCUDAKernel<
paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using platform::Transform;
template <typename DeviceContext, typename T>
class FakeQuantizeKernel : public framework::OpKernel<T> {
public:
T FindAbsMax(framework::Tensor* in, int n) const {
T* p = in->mutable_data<T>(platform::CPUPlace());
T abs_max = (T)0.00000001;
for (int i = 0; i < n; i++) {
T tmp = fabs(p[i]);
if (tmp > abs_max) abs_max = tmp;
}
return T(abs_max);
}
T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
const T& cur_scale, int window_size,
int current_iter) const {
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMax(scale_list, size));
}
return max_scale;
}
T FindMovingAverageAbsMax(framework::Tensor* in_scale,
framework::Tensor* out_scale,
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
return T(outs[0]);
}
virtual void Compute(const framework::ExecutionContext& context) const {
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test");
tensor->mutable_data<T>(in->place());
auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
oms_tensor->mutable_data<T>(in->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
oss_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
oci_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
}
T scale = static_cast<T>(1);
int window_size = context.Attr<int>("window_size");
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev =
*context.template device_context<DeviceContext>().eigen_device();
auto raw_in = framework::EigenVector<T>::Flatten(*in);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = scale_out(0);
auto& device_ctx = context.template device_context<DeviceContext>();
auto* scale_list = context.Output<framework::Tensor>("OutScales");
math::SetConstant<DeviceContext, T> scalar;
scale_list->mutable_data<T>(context.GetPlace());
scalar(device_ctx, scale_list, static_cast<T>(0));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
iter->mutable_data<T>(context.GetPlace());
scalar(device_ctx, iter, static_cast<T>(0));
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) {
scale = moving_scale->data<T>()[0];
} else {
auto* it = context.Input<framework::Tensor>("InCurrentIter");
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
const int* last_iter = it->data<int>();
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
current_iter[0]);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) {
scale = moving_scale->data<T>()[0];
} else {
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindMovingAverageAbsMax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
}
}
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), in->data<T>(),
in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
ClipFunctor<T>(-scale, scale));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
}
};
} // namespace operators
} // namespace paddle
...@@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase {
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");

    for (auto& ep : eps) {
      VLOG(3) << "fetch barrier, ep: " << ep;
      rpc_client->AsyncSendFetchBarrier(ep);
    }
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
  }
};
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <vector>
namespace paddle {
namespace operators {
/**
* Organize the classes into a binary tree. At each node, a sigmoid function
* is used to calculate the probability of belonging to the right branch.
* This idea is from "F. Morin, Y. Bengio (AISTATS 05):
* Hierarchical Probabilistic Neural Network Language Model."
*
* Here we use a simple way of building the binary tree.
* Assuming the number of classes C = 6,
* The classes are organized as a binary tree in the following way:
*
* @code{.py}
* *-*-*- 2
* | | |- 3
* | |
* | |-*- 4
* | |- 5
* |
* |-*- 0
* |- 1
* @endcode
*
* where * indicates an internal node, and each leaf node represents a class.
* - Node 0 ... C-2 are internal nodes.
* - Node C-1 ... 2C-2 are leaf nodes.
* - Class c is represented by leaf node \f$c+C-1\f$.
*
* We assign an id to each node:
* - the id of the root is 0.
* - the left child of node i is 2*i+1.
* - the right child of node i is 2*i+2.
*
* It's easy to see that:
* - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
* - the j-th level ancestor of node i is
* \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
* - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
*
*/
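// A compact restatement of the index arithmetic described above (sketch only,
// using the 0-based ids defined in the comment):
//
//   parent(i)      = (i - 1) / 2
//   left_child(i)  = 2 * i + 1
//   right_child(i) = 2 * i + 2
//   is_left(i)     = ((i - 1) % 2 == 0)
//
// With C = 6 classes, class 2 sits at leaf node 2 + C - 1 = 7, and its path
// to the root is 7 -> 3 -> 1 -> 0.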
class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
"Output(PreOut) should not be null.");
const int64_t batch_size = ctx->GetInputDim("X")[0];
std::vector<int64_t> output_shape({batch_size, 1});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace());
}
};
template <typename AttrType>
class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size.");
AddInput("W",
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D].");
AddInput("Label",
"(Tensor, required), The labels of training data. It's a"
"tensor with shape [N, 1].");
AddInput("Bias",
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1].");
AddOutput("Out",
"(Tensor, required) The output of hierarchical sigmoid operator."
"The shape is [N, 1].");
AddOutput("PreOut",
"(Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.")
.AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
.SetDefault(2);
AddComment(R"DOC(
The hierarchical sigmoid operator organizes the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC");
}
};
class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PreOut"),
"Input(Preout) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
"Output(W@Grad should not be null.)");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
ops::HierarchicalSigmoidOpMaker<int>,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
REGISTER_OP_CPU_KERNEL(
hierarchical_sigmoid,
ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
double>);
REGISTER_OP_CPU_KERNEL(
hierarchical_sigmoid_grad,
ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
float>,
ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform;
template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* w = ctx.Input<framework::Tensor>("W");
auto* label = ctx.Input<framework::Tensor>("Label");
auto* bias = ctx.Input<framework::Tensor>("Bias");
auto* out = ctx.Output<framework::Tensor>("Out");
auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
int64_t code_length = math::FindLastSet(num_classes - 1);
int64_t batch_size = in->dims()[0];
framework::Tensor sum;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* pre_out_data = pre_out->mutable_data<T>(
framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    // Not all class(leaf) nodes' path lengths equal code_length; initializing
    // pre_out to 0 keeps positions beyond a sample's path from adding loss.
math::SetConstant<DeviceContext, T> zero;
zero(dev_ctx, pre_out, static_cast<T>(0.0));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::RowwiseSum<DeviceContext, T> row_sum;
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
std::vector<int64_t> sum_dims({batch_size, 1UL});
sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
auto sum_mat = EigenMatrix<T>::From(sum);
out->mutable_data<T>(ctx.GetPlace());
auto out_mat = framework::EigenVector<T>::Flatten(*out);
if (bias) {
bit_code.Add(pre_out, *bias);
}
bit_code.Mul(pre_out, *w, *in);
// clip to [-40, 40]
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out->numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
bit_code.Sum(*pre_out, out, static_cast<T>(-1));
// use softrelu to calculate cross entropy
pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
row_sum(dev_ctx, *pre_out, &sum);
    // TODO(guosheng): Subtract the out-of-path loss, since not all class(leaf)
    // nodes' path lengths equal code_length. This won't break the gradient
    // check, because the forward and backward computations share the same
    // out-of-path terms, which cancel each other out.
out_mat.device(place) = sum_mat + out_mat;
}
};
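// A sketch of what the forward pass above evaluates per example, writing
// z_j = w_j^T x + b_j for the pre-activation at the j-th node on the code
// path and d_j for the j-th code bit:
//
//   loss = sum_j [ log(1 + exp(z_j)) - d_j * z_j ]
//        = -sum_j [ d_j * log(sigmoid(z_j))
//                   + (1 - d_j) * log(1 - sigmoid(z_j)) ]
//
// i.e. the softrelu row sum plus the negated bit-code sum accumulates the
// binary cross entropy along the path.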
template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* w = ctx.Input<framework::Tensor>("W");
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
auto* bias_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
auto* label = ctx.Input<framework::Tensor>("Label");
auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor pre_out_grad;
pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
in_grad->mutable_data<T>(ctx.GetPlace());
w_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> zero;
zero(dev_ctx, in_grad, static_cast<T>(0.0));
zero(dev_ctx, w_grad, static_cast<T>(0.0));
size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
// softrelu derivative
pre_out_grad_mat.device(place) =
static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b)
pre_out_grad_mat.device(place) =
pre_out_grad_mat * out_grad_mat.broadcast(bcast);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, bias_grad, static_cast<T>(0.0));
bit_code.AddGrad(pre_out_grad, bias_grad);
}
bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
bit_code.MulGradError(pre_out_grad, *w, in_grad);
}
};
} // namespace operators
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/im2sequence_op.h"
#include <string>
#include <vector>

namespace paddle {
...@@ -28,27 +29,18 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
                   "Input(X) of Im2SequenceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of Im2SequenceOp op should not be null.");
    auto in_dim = ctx->GetInputDim("X");

    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
                      "Input(X) format must be 4D tensor, eg., NCHW.");
    int img_channels = in_dim[1];

    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");

    ctx->SetOutputDim("Out",
                      {in_dim[0], img_channels * kernels[0] * kernels[1]});
  }
};
...@@ -61,6 +53,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
             "C: channels"
             "H: height"
             "W: width");
    AddInput("Y",
             "(Tensor) The input tensor of image real size (H, W). "
             "2-D with shape [batch_size, 2].")
        .AsDispensable();
    AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
    AddAttr<std::vector<int>>("kernels",
                              "(vector<int>), the "
...@@ -73,6 +69,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
                              "(vector<int> default:{0, 0, 0, 0}), the "
                              "paddings(up_pad, left_pad, down_pad, right_pad)")
        .SetDefault({0, 0, 0, 0});
    AddAttr<std::vector<int>>("out_stride",
                              "the attribute is valid only when input(Y) "
                              "is not NULL. This attribute represents the "
                              "scaling of the picture through the CNN "
                              "(vector<int> default:{1, 1}), the out_stride "
                              "(out_stride_height, out_stride_width)")
        .SetDefault({1, 1});
    AddComment(R"DOC(
This op uses kernels to scan images and converts these images to sequences.
After expanding, the number of time steps is output_height * output_width
...@@ -123,7 +126,7 @@ output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.]
                [ 7. 1. 7. 9. 2. 1. 3. 5.]
                [ 5. 7. 2. 4. 1. 3. 9. 0.]
                [ 7. 9. 4. 8. 3. 5. 0. 8.]]
output.dims = {8, 8}
output.lod = [[0, 4, 8]]
)DOC");
...
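The {8, 8} fix in the DOC example above follows from the sliding-window output size the op relies on. The real helper Im2SeqOutputSize lives in im2sequence_op.h and is not shown in this diff; a sketch of the presumed arithmetic:

    // Presumed output size along one axis: how many kernel positions fit,
    // given padding on both sides and a stride.
    inline int Im2SeqOutputSizeSketch(int img_size, int kernel_size,
                                      int padding_before, int padding_after,
                                      int stride) {
      return (img_size + padding_before + padding_after - kernel_size) /
                 stride + 1;
    }

Each sample then contributes output_height * output_width rows to Out, each of width img_channels * kernel_height * kernel_width.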
...@@ -13,6 +13,7 @@
limitations under the License. */

#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/eigen.h"
...@@ -39,51 +40,108 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* in = ctx.Input<Tensor>("X");
    LoDTensor* out = ctx.Output<LoDTensor>("Out");
    auto in_dim = in->dims();
    int batch_size = in_dim[0];
    int img_channels = in_dim[1];
    int img_height = in_dim[2];
    int img_width = in_dim[3];
    auto kernels = ctx.Attr<std::vector<int>>("kernels");
    auto strides = ctx.Attr<std::vector<int>>("strides");
    auto paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.HasInput("Y") && batch_size > 1) {
const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
Tensor cpu_shape_tensor;
TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
std::vector<int> imgreal_h;
std::vector<int> imgreal_w;
std::vector<int> output_height;
std::vector<int> output_width;
int result = 0;
for (int i = 0; i < batch_size; i++) {
int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
int tmp_real_w =
static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
if (tmp_real_h % out_stride[0] == 0) {
tmp_real_h = tmp_real_h / out_stride[0];
} else {
tmp_real_h = tmp_real_h / out_stride[0] + 1;
}
if (tmp_real_w % out_stride[1] == 0) {
tmp_real_w = tmp_real_w / out_stride[1];
} else {
tmp_real_w = tmp_real_w / out_stride[1] + 1;
}
imgreal_h.push_back(tmp_real_h);
imgreal_w.push_back(tmp_real_w);
output_height.push_back(Im2SeqOutputSize(
imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
output_width.push_back(Im2SeqOutputSize(
imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
result += output_height[i] * output_width[i];
}
out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
ctx.GetPlace());
const std::vector<int> dilations({1, 1});
int offset_out = 0;
for (int i = 0; i < batch_size; i++) {
const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(offset_out,
offset_out + output_height[i] * output_width[i])
.Resize({output_height[i], output_width[i],
img_channels, kernels[0], kernels[1]});
offset_out += output_height[i] * output_width[i];
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst);
}
framework::LoD lod(1);
lod[0].reserve(batch_size + 1);
int offset = 0;
lod[0].push_back(offset);
for (int i = 0; i < batch_size; ++i) {
offset += output_height[i] * output_width[i];
lod[0].push_back(offset);
}
out->set_lod(lod);
} else {
int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
paddings[2], strides[0]); paddings[2], strides[0]);
int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
paddings[3], strides[1]); paddings[3], strides[1]);
out->mutable_data<T>({batch_size * output_height * output_width,
img_channels * kernels[0] * kernels[1]},
ctx.GetPlace());
const std::vector<int> dilations({1, 1}); const std::vector<int> dilations({1, 1});
auto out_dims = out->dims(); auto out_dims = out->dims();
out->Resize({batch_size, out->numel() / batch_size}); out->Resize({batch_size, out->numel() / batch_size});
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
const Tensor src = const Tensor src =
in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
Tensor dst = out->Slice(i, i + 1).Resize( Tensor dst =
{output_height, output_width, img_channels, kernels[0], kernels[1]}); out->Slice(i, i + 1).Resize({output_height, output_width,
img_channels, kernels[0], kernels[1]});
math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f; math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
f(dev_ctx, src, dilations, strides, paddings, &dst); f(dev_ctx, src, dilations, strides, paddings, &dst);
} }
out->Resize(out_dims); out->Resize(out_dims);
// set lod information
// TODO(wanghaoshuang): Move this to InferShape
framework::LoD lod(1); framework::LoD lod(1);
lod[0].reserve(batch_size + 1); lod[0].reserve(batch_size + 1);
for (int i = 0, offset = 0; i < batch_size + 1; ++i) { int offset = 0;
lod[0].push_back(offset); lod[0].push_back(offset);
for (int i = 0; i < batch_size; ++i) {
offset += output_height * output_width; offset += output_height * output_width;
lod[0].push_back(offset);
} }
out->set_lod(lod); out->set_lod(lod);
} }
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
@@ -61,6 +61,8 @@ static void ParallelExecuteBlocks(
    framework::Async([&executor, &prepared, &program, &scope, idx]() {
      int run_block = idx;  // thread local
      try {
        VLOG(3) << "running server block: " << run_block
                << " pointer: " << prepared[run_block].get();
        executor->RunPreparedContext(prepared[run_block].get(), scope);
      } catch (const std::exception &e) {
        LOG(ERROR) << "run sub program error " << e.what();
@@ -107,12 +109,14 @@ void ListenAndServOp::RunSyncLoop(
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");
  std::vector<int> optimize_blocks_idx;
  for (auto blk : optimize_blocks) {
    optimize_blocks_idx.push_back(blk->ID());
  }
  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
  // Insert placeholder for block0 which holds current op itself.
  // Prepare all the server blocks
  std::vector<int> optimize_blocks_list;
  for (size_t i = 1; i < program->Size(); ++i) {
    optimize_blocks_list.push_back(i);
  }
  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
  // Insert placeholder for block0, which holds the current op itself.
  // NOTE: the first block in `optimize_prepared` should never be run.
  optimize_prepared.insert(
      optimize_prepared.begin(),
      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
...
@@ -51,6 +51,7 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
math_library(matrix_bit_code)
math_library(unpooling)
math_library(vol2col)
...
@@ -21,6 +21,10 @@
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_WITH_LIBXSMM
#include <libxsmm.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
@@ -30,6 +31,12 @@ struct CBlas<float> {
    platform::dynload::cblas_sgemm(args...);
  }
#ifdef PADDLE_WITH_LIBXSMM
template <typename... ARGS>
static void SMM_GEMM(ARGS... args) {
libxsmm_sgemm(args...);
}
#endif
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    platform::dynload::cblas_saxpy(args...);
@@ -63,6 +70,12 @@ struct CBlas<double> {
    platform::dynload::cblas_dgemm(args...);
  }
#ifdef PADDLE_WITH_LIBXSMM
template <typename... ARGS>
static void SMM_GEMM(ARGS... args) {
libxsmm_dgemm(args...);
}
#endif
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    platform::dynload::cblas_daxpy(args...);
@@ -140,6 +153,9 @@ struct CBlas<double> {
template <>
struct CBlas<platform::float16> {
  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
static void SMM_GEMM(...) {
PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML
  static void GEMM_BATCH(...) {
    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -147,6 +163,33 @@ struct CBlas<platform::float16> {
#endif
};
template <typename T>
inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
bool transb, const T &alpha, const T &beta) {
#ifdef PADDLE_WITH_LIBXSMM
  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md,
  // but the threshold used here is custom rather than libxsmm's default.
  constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
  if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
      std::abs(alpha - static_cast<T>(1)) >
          std::numeric_limits<T>::epsilon() ||
      std::abs(beta) > std::numeric_limits<T>::epsilon()) {
return false;
} else {
return true;
}
#endif
return false;
}
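// Worked example (an illustration added here, not part of the original
// source): under the rule above, a small plain product such as
//   UseXSMM<float>(4, 4, 4, /*transa=*/false, /*transb=*/false,
//                  /*alpha=*/1.0f, /*beta=*/0.0f)
// returns true (4*4*4 <= 20*20*20, no transpose, alpha == 1, beta == 0),
// while any transposed, scaled, or larger-than-threshold GEMM returns false
// and falls back to the vendor BLAS.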
template <>
inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
bool transa, bool transb,
const platform::float16 &alpha,
const platform::float16 &beta) {
return false;
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -156,8 +199,21 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
#ifdef PADDLE_WITH_LIBXSMM
  if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
              beta)) {
    // Note: SMM uses ColMajor
    const char transa = 'N';
    const char transb = 'N';
    CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
                       &beta, C, &ldc);
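    // The swapped operand order above relies on the standard identity
    // C^T = B^T * A^T: libxsmm computes a column-major GEMM, so feeding it
    // (B, A) with M and N exchanged yields the row-major
    // C = alpha * A * B + beta * C that the surrounding code expects.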
  } else {
#endif
    CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
                   ldb, beta, C, ldc);
#ifdef PADDLE_WITH_LIBXSMM
  }
#endif
}
template <>
...
@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
((dilation[0] * (filter_height - 1) + 1))) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
((dilation[1] * (filter_width - 1) + 1))) /
stride[1] +
1,
col_width,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
    int channels_col = im_channels * filter_height * filter_width;
    const T* im_data = im.data<T>();
@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ(
(im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
    const T* im_data = im.data<T>();
    T* col_data = col->data<T>();
...
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
    int num_outputs = im_channels * col_height * col_width;
    int blocks = (num_outputs + 1024 - 1) / 1024;
    int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(dilation[0] * (filter_height - 1) + 1)) /
stride[0] +
1,
col_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
    int block_dim_x = 0;
    int block_dim_y = 0;
    if (filter_height <= 4 && filter_width <= 4) {
...
@@ -155,7 +155,7 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
    PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
    auto height = in_dims[0];
    auto size = in_dims[1];
    PADDLE_ENFORCE_EQ(out->numel(), size);
    PADDLE_ENFORCE_EQ(out->numel(), height);
    T* out_buf = out->mutable_data<T>(out->place());
    const T* in_buf = input.data<T>();
...
@@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) {
  EXPECT_EQ(input3_ptr[6], 86);
  EXPECT_EQ(input3_ptr[7], 99);
}
#ifdef PADDLE_WITH_LIBXSMM
template <typename T>
void MklSmmCompare(int m, int n, int k) {
paddle::framework::Tensor mat_a;
paddle::framework::Tensor mat_b;
paddle::framework::Tensor mat_c_smm;
paddle::framework::Tensor mat_c_mkl;
auto* cpu_place = new paddle::platform::CPUPlace();
T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
T* CSMM = mat_c_smm.mutable_data<T>({m, n}, *cpu_place);
T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
T alpha = static_cast<T>(1);
T beta = static_cast<T>(0);
for (int i = 0; i < mat_a.numel(); ++i) {
A[i] = static_cast<T>(i);
}
for (int i = 0; i < mat_b.numel(); ++i) {
B[i] = static_cast<T>(i);
}
// lda,ldb,ldc follow RowMajor
int lda = k;
int ldb = n;
int ldc = n;
auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
const char transa = 'N';
const char transb = 'N';
paddle::operators::math::CBlas<T>::SMM_GEMM(&transa, &transb, &n, &m, &k,
&alpha, B, &ldb, A, &lda, &beta,
CSMM, &ldc);
};
auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() {
paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
CblasNoTrans, m, n, k, alpha, A,
lda, B, ldb, beta, CMKL, ldc);
};
smm();
mkl();
ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel());
for (int i = 0; i < mat_c_mkl.numel(); ++i) {
EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]);
}
}
TEST(math_function, gemm_mkl_vs_smm) {
MklSmmCompare<float>(1, 2, 3);
MklSmmCompare<double>(1, 2, 3);
MklSmmCompare<float>(3, 2, 1);
MklSmmCompare<double>(3, 2, 1);
MklSmmCompare<float>(3, 8, 5);
MklSmmCompare<double>(3, 8, 5);
}
#endif
TEST(math_function, gemm_trans_cblas) {
  paddle::framework::Tensor input1;
  paddle::framework::Tensor input2;
  paddle::framework::Tensor input3;
...
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include <iostream>
namespace paddle {
namespace operators {
namespace math {
template <typename T>
void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
const framework::Tensor& vec) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat->dims()[0];
size_t width = tmat->dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
tmat->data<T>()[i * width + j] += vec.data<T>()[index];
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
framework::Tensor* vec) {
SimpleCodeTable code_table(num_classes_);
size_t batch_size = tmat.dims()[0];
size_t width = tmat.dims()[1];
for (size_t i = 0; i < batch_size; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
vec->data<T>()[index] += tmat.data<T>()[i * width + j];
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
framework::Tensor* sum, T scale_sum) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t o_width = tmat.dims()[1];
for (size_t i = 0; i < num_samples; ++i) {
T sm = static_cast<T>(0.0);
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) {
        // calc_bit starts from the rightmost bit, while the data in tmat[i]
        // is stored in the reverse order.
sm += tmat.data<T>()[i * o_width + j];
}
}
sum->data<T>()[i] = scale_sum * sm;
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
const framework::Tensor& weight,
const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0];
size_t tmat_width = tmat->dims()[1];
size_t input_width = input.dims()[1];
size_t weight_width = weight.dims()[1];
auto tmat_value = tmat->data<T>();
auto weight_value = weight.data<T>();
auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
T sum = static_cast<T>(0.0);
for (size_t k = 0; k < input_width; ++k) {
sum += weight_value[weight_width * index + k] *
input_value[input_width * i + k];
}
tmat_value[i * tmat_width + j] += sum;
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::Tensor* weight,
const framework::Tensor& input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1];
size_t weight_width = weight->dims()[1];
auto tmat_value = tmat.data<T>();
auto weight_value = weight->data<T>();
auto input_value = input.data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
for (size_t k = 0; k < input_width; ++k) {
weight_value[weight_width * index + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
}
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
const framework::Tensor& weight,
framework::Tensor* input) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat.dims()[0];
size_t tmat_width = tmat.dims()[1];
size_t input_width = input->dims()[1];
size_t weight_width = weight.dims()[1];
auto tmat_value = tmat.data<T>();
auto weight_value = weight.data<T>();
auto input_value = input->data<T>();
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
size_t index = code.calc_index(j);
for (size_t k = 0; k < input_width; ++k) {
input_value[input_width * i + k] +=
tmat_value[i * tmat_width + j] *
weight_value[weight_width * index + k];
}
}
}
}
template <typename T>
void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
SimpleCodeTable code_table(num_classes_);
size_t num_samples = tmat->dims()[0];
size_t o_width = tmat->dims()[1];
for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table(static_cast<size_t>(ids_[i]));
int code_length = code.get_length();
for (int j = 0; j < code_length; ++j) {
if (code.calc_bit(j)) {
tmat->data<T>()[i * o_width + j] -= 1;
}
}
}
}
template class MatrixBitCodeFunctor<float>;
template class MatrixBitCodeFunctor<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
/**
* SimpleCodeTable class should support 3 functions:
*
* size_t size()
* return the number of ids
*
* int get_max_code_length()
* return the maximal code length
*
* SimpleCode operator()(size_t i)
 * return the i-th code. The Code class is described below.
*
* SimpleCode class should support 3 functions:
*
* int get_length()
* return the length of the code
*
 * size_t calc_index(int bit)
* bit ranges from 0 to get_length() - 1
* return the index for the (1+bit) level parent
*
* bool calc_bit(int bit)
 * return true if the bit-level parent is the right child of the (1+bit)-level
 * parent
*
*/
/**
* return the 1-based index of the highest bit set
*
* for x > 0:
* \f[
* FindLastSet(x) = 1 + \floor*{\log_{2}x}
* \f]
*/
inline constexpr size_t FindLastSet(size_t x) {
return std::is_same<size_t, unsigned int>::value
? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
: (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
}
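// A few worked values (illustrative, not part of the original source):
//   FindLastSet(1) == 1, FindLastSet(6 /* 0b110 */) == 3, FindLastSet(8) == 4,
// matching 1 + floor(log2(x)); FindLastSet(0) is defined as 0.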
struct SimpleCode {
SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
/**
   * Here the id of the root should be 1 rather than 0, so the encoding of
   * class c is `c + num_classes`, and all siblings can get the same weight
   * indices using prefixes.
   * The weight index is the prefix of the encoding, so the rightmost bit is
   * left out in calc_index.
   * The binary classification path is the suffix of the encoding, so the
   * leftmost bit is left out in calc_bit.
*/
inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
inline int get_length() const { return FindLastSet(c_) - 1; }
private:
size_t c_;
};
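// Worked example (illustrative, not part of the original source): with
// num_classes = 4 and class id 2, c_ = 2 + 4 = 6 = 0b110, so
//   get_length()  == FindLastSet(6) - 1 == 2,
//   calc_index(0) == (6 >> 1) - 1 == 2,  calc_index(1) == (6 >> 2) - 1 == 0,
//   calc_bit(0)   == false,              calc_bit(1)   == true.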
struct SimpleCodeTable {
explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {}
SimpleCode operator()(size_t code) const {
return SimpleCode(code, num_classes_);
}
size_t size() const { return num_classes_; }
int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
private:
size_t num_classes_;
};
template <typename T>
class MatrixBitCodeFunctor {
public:
explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
: num_classes_(num_classes), ids_(ids) {}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
void Add(framework::Tensor* tmat, const framework::Tensor& vec);
/* For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/
void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum);
/* For j < code_length
tmat(i, j) -= bit(i, j)
*/
void Sub(framework::Tensor* tmat);
/* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/
void Mul(framework::Tensor* tmat, const framework::Tensor& weight,
const framework::Tensor& input);
/* For index(i, j) >= 0:
weight.row(index(i, j)) += tmat(i, j) * input.row(i)
*/
void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight,
const framework::Tensor& input);
/* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/
void MulGradError(const framework::Tensor& tmat,
const framework::Tensor& weight, framework::Tensor* input);
size_t num_classes_;
const int64_t* ids_;
};
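// A hypothetical usage sketch (the variable names here are assumptions, not
// from the original source): for hierarchical softmax over `num_classes`
// leaves, the pre-activations in `tmat` are accumulated along each sample's
// code path:
//
//   MatrixBitCodeFunctor<float> bit_code(num_classes, label_ids);
//   bit_code.Mul(&tmat, weight, input);  // tmat(i, j) += <w_index(i,j), x_i>
//   bit_code.Add(&tmat, bias);           // tmat(i, j) += bias(0, index(i, j))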
} // namespace math
} // namespace operators
} // namespace paddle
@@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase {
        VLOG(3) << "don't send uninitialized variable: " << ins[i];
      }
    }
    rpc_client->Wait();
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
  }
};
...
@@ -81,6 +81,15 @@ class BlockingQueue {
    }
  }
void ReOpen() {
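    // Reset to a reusable state: mark the queue open again, drop any leftover
    // elements, and wake every blocked producer and consumer so they re-check
    // the queue state.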
std::lock_guard<std::mutex> lock(mutex_);
closed_ = false;
std::deque<T> new_deque;
queue_.swap(new_deque);
send_cv_.notify_all();
receive_cv_.notify_all();
}
  void Close() {
    std::lock_guard<std::mutex> lock(mutex_);
    closed_ = true;
...
@@ -23,7 +23,7 @@ class BatchReader : public framework::DecoratedReader {
  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size,
              bool discard_leftover)
      : DecoratedReader(reader),
        batch_size_(batch_size),
        batch_size_(static_cast<size_t>(batch_size)),
        discard_leftover_(discard_leftover) {
    buffer_.reserve(batch_size_);
  }
@@ -31,7 +31,7 @@ class BatchReader : public framework::DecoratedReader {
  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
 private:
  int batch_size_;
  size_t batch_size_;
  bool discard_leftover_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;
};
@@ -78,7 +78,7 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
  buffer_.clear();
  buffer_.reserve(batch_size_);
  for (int i = 0; i < batch_size_; ++i) {
  for (size_t i = 0; i < batch_size_; ++i) {
    buffer_.push_back(std::vector<framework::LoDTensor>());
    reader_->ReadNext(&buffer_.back());
    if (buffer_.back().empty()) {
@@ -95,9 +95,9 @@ void BatchReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
    // if buffer_ is empty, the 'out' will return as an empty vector.
    return;
  }
  int out_num = buffer_[0].size();
  size_t out_num = buffer_[0].size();
  out->reserve(out_num);
  for (int j = 0; j < out_num; ++j) {
  for (size_t j = 0; j < out_num; ++j) {
    // Merge shape and check data type
    std::type_index batch_type = buffer_[0][j].type();
    framework::DDim batch_shape = buffer_[0][j].dims();
...
@@ -27,19 +27,17 @@ class PyReader : public framework::FileReader {
    queue_ = queue;
  }
  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    bool success;
    *out = queue_->Pop(&success);
    if (!success) out->clear();
  }
 private:
  void ShutdownImpl() override { /* TODO */
  }
  void StartImpl() override { /* TODO */
  }
  void Shutdown() override { queue_->Close(); }
  void Start() override { queue_->ReOpen(); }
 private:
  std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
...
@@ -58,12 +58,15 @@ class LoDTensorBlockingQueue {
  inline size_t Size() const { return queue_.Size(); }
  inline void Close() { return queue_.Close(); }
  inline void ReOpen() { queue_.ReOpen(); }
  inline void Close() { queue_.Close(); }
  inline bool IsClosed() const { return queue_.IsClosed(); }
 private:
  void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
  void CheckDims(
      const std::vector<framework::LoDTensor>& lod_tensor_vec) const {
    PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
                   "Expect input size is %d but found %s", dims_.size(),
                   lod_tensor_vec.size());
...
@@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase {
      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
    }
    if (sync_mode) {
      rpc_client->Wait();
      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
    }
  }
};
...
@@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase {
    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
    // need to wait before sending send_barrier message
    rpc_client->Wait();
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
    if (sync_mode) {
      for (auto& ep : eps) {
        VLOG(3) << "send barrier, ep: " << ep;
        rpc_client->AsyncSendBatchBarrier(ep);
      }
      rpc_client->Wait();
      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
    }
  }
};
...
@@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase {
      }
    }
    if (sync_send) {
      rpc_client->Wait();
      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
    }
  }
};
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class SqueezeOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SqueezeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SqueezeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
// Check input tensor dims (<6) Eigen limit.
PADDLE_ENFORCE(x_dims.size() <= 6,
"Invalid dimnesions, the rank of Input(X) "
"should be in the range of [1, 6] (Eigen limit).");
const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
for (int a : axes) {
PADDLE_ENFORCE_LT(a, x_dims.size(),
"The squeeze axis should be less than input "
"tensor's rank.");
}
auto out_dims = GetOutputShape(axes, x_dims);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
}
static framework::DDim GetOutputShape(const std::vector<int> squeeze_dims,
const framework::DDim &in_dims) {
size_t num_squeeze_dims = squeeze_dims.size();
int cnt_squeezed_dims = 0;
bool should_squeeze[9] = {false};
// Determines number of dimensions of output tensor after squeeze.
    // Mark and count the dimensions that need to be squeezed.
if (num_squeeze_dims == 0) {
for (int idx = 0; idx < in_dims.size(); ++idx) {
if (in_dims[idx] == 1) {
should_squeeze[idx] = true;
++cnt_squeezed_dims;
}
}
} else {
for (size_t idx = 0; idx < num_squeeze_dims; ++idx) {
int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
: squeeze_dims[idx];
        // Check current index, the upper limit has been checked in line 36.
PADDLE_ENFORCE(current >= 0,
"Invalid axis, the negative axis is out of range.");
PADDLE_ENFORCE(in_dims[current] == 1,
"Invalid axis index, the axis that will be squeezed "
"should be equal to 1.");
if (!(should_squeeze[current])) {
++cnt_squeezed_dims;
}
should_squeeze[current] = true;
}
}
// Make output dimensions
std::vector<int64_t> output_shape(in_dims.size() - cnt_squeezed_dims, 0);
for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) {
if (!should_squeeze[in_idx]) {
output_shape[out_idx++] = in_dims[in_idx];
}
}
return framework::make_ddim(output_shape);
}
};
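// Illustrative values (not part of the original source), matching the doc
// examples further below: with in_dims = (1, 3, 1, 5),
//   GetOutputShape({0}, in_dims) -> (3, 1, 5)
//   GetOutputShape({},  in_dims) -> (3, 5)   // every dim of size 1 squeezed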
class SqueezeOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
attrs["inplace"] = Attr<bool>("inplace");
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor). The input tensor of squeeze operator.");
AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
AddAttr<std::vector<int>>("axes",
"(std::vector<int>). List of integers,"
" indicating the dimensions to squeeze.")
.SetDefault({});
AddAttr<bool>("inplace",
"(default: false) Squeeze the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x).")
.SetDefault(false);
AddComment(R"DOC(
Squeeze Operator.
Remove single-dimensional entries from the shape of a tensor.
Takes a parameter axes with a list of axes to squeeze.
If axes is not provided, all the single dimensions will be removed from the shape.
If an axis is selected with shape entry not equal to one, an error is raised.
Examples:
Case 1:
Given
X.shape = (1, 3, 1, 5)
and
axes = [0]
we get:
Out.shape = (3, 1, 5)
Case 2:
Given
X.shape = (1, 3, 1, 5)
and
axes = []
we get:
Out.shape = (3, 5)
)DOC");
}
};
class SqueezeGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
context->SetOutputDim(framework::GradVarName("X"),
context->GetInputDim("X"));
context->ShareLoD("X", framework::GradVarName("X"));
}
};
class SqueezeGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = Attr<bool>("inplace");
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
// Tell linker to use reshape op
USE_OP(reshape);
namespace ops = paddle::operators;
REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
ops::SqueezeOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
@@ -88,7 +88,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      input_format = memory::format::nc;
    }
    for (int i = in_place ? 1 : 0; i < N; i++) {
    for (int i = 0; i < N; i++) {
      PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
                     "all inputs must be all LoDTensors");
      auto& input = in_vars[i]->Get<LoDTensor>();
...
@@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel<T> {
#endif
    for (size_t i = 0; i < row; i++) {
      std::vector<std::pair<T, size_t>> vec;
      vec.reserve(col);
      for (size_t j = 0; j < col; j++) {
        vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
      }
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UnsqueezeOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of UnsqueezeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of UnsqueezeOp should not be null.");
const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
const auto &x_dims = ctx->GetInputDim("X");
// Validity Check: input tensor dims (<6).
PADDLE_ENFORCE(x_dims.size() <= 6,
"Invalid dimensions, the rank of Input(X) "
"should be in the range of [1, 6] (Eigen limit)");
auto out_dims = GetOutputShape(axes, x_dims);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
}
static framework::DDim GetOutputShape(const std::vector<int> unsqz_dims,
const framework::DDim &in_dims) {
int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
int cur_output_size = in_dims.size();
std::vector<int64_t> output_shape(output_size, 0);
// Validity Check: rank range.
PADDLE_ENFORCE(output_size <= 6,
"The output tensor's rank should be less than 6.");
for (int axis : unsqz_dims) {
int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
      // Validity Check: the axis bound
PADDLE_ENFORCE(
cur >= 0 && cur <= cur_output_size,
"The unsqueeze dims must be within range of current rank.");
// Move old axis, and insert new axis
for (int i = cur_output_size; i >= cur; --i) {
if (output_shape[i] == 1) {
// Move axis
output_shape[i + 1] = 1;
output_shape[i] = 0;
}
}
output_shape[cur] = 1;
// Add the output size.
cur_output_size++;
}
// Make output shape
for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
if (output_shape[out_idx] == 0) {
output_shape[out_idx] = in_dims[in_idx++];
}
}
return framework::make_ddim(output_shape);
}
};
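// Illustrative value (not part of the original source), matching the doc
// example further below: with in_dims = (3, 4, 5),
//   GetOutputShape({0, 4}, in_dims) -> (1, 3, 4, 5, 1)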
class UnsqueezeOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
attrs["inplace"] = Attr<bool>("inplace");
// Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor). The input tensor of unsqueeze operator.");
AddOutput("Out", "(Tensor). The output tensor of unsqueeze operator.");
AddAttr<std::vector<int>>("axes",
"(std::vector<int>). List of integers,"
" indicating the dimensions to be inserted")
.AddCustomChecker([](const std::vector<int> &axes) {
PADDLE_ENFORCE(!axes.empty(),
"Invalid axes, The unsqueeze axes is empty.");
// Validity Check: axes dims (<6).
PADDLE_ENFORCE(static_cast<int>(axes.size()) < 6,
"Invalid dimensions, dynamic dimensions should be "
"within [1, 6] dimensions (Eigen limit).");
          // Validity Check: the range of unsqueeze axis.
for (int axis : axes) {
PADDLE_ENFORCE(axis < 6,
"Invalid dimensions, input axis should be"
" within [1, 6] dimensions (Eigen limit).");
}
});
AddAttr<bool>(
"inplace",
"(default: false) Unsqueeze the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x).")
.SetDefault(false);
AddComment(R"DOC(
Unsqueeze Operator.
Insert single-dimensional entries to the shape of a tensor.
Takes one required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.
For example:
Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueeze(tensor, axes=[0, 4]) has shape [1, 3, 4, 5, 1]
)DOC");
}
};
class UnsqueezeGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", framework::GradVarName("X"));
}
};
class UnsqueezeGradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = Attr<bool>("inplace");
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators
} // namespace paddle
// Tell linker to use reshape op.
USE_OP(reshape);
namespace ops = paddle::operators;
REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
ops::UnsqueezeOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
ops::UnsqueezeGradInferShape);
@@ -46,7 +46,7 @@ ENDIF()
# memcpy depends on device_context; add deps individually here to
# avoid cyclic dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
...
@@ -222,15 +222,16 @@ class MKLDNNHandler {
  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                             const std::string& suffix) {
    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
      std::string dstr = "";
      for (size_t i = 0; i < operand_dims.size(); ++i) {
        dstr += std::to_string(operand_dims[i]) + "-";
      }
      return dstr;
    };
    return dims2str(operand_dims) + suffix;
  }
  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                             const std::string& suffix) {
    return dims2str(operand_dims) + suffix;
  }
 protected:
  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
    std::string dstr = "";
    for (size_t i = 0; i < operand_dims.size(); ++i) {
      dstr += std::to_string(operand_dims[i]) + "-";
    }
    return dstr;
  }
 protected:
...
@@ -145,14 +145,14 @@ void BindBlockDesc(pybind11::module *m) {
      .def_property_readonly("id", &pd::BlockDesc::ID)
      .def_property_readonly("parent", &pd::BlockDesc::Parent)
      .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
      .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
      .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
      .def("append_op", &pd::BlockDesc::AppendOp,
           pybind11::return_value_policy::reference)
      .def("prepend_op", &pd::BlockDesc::PrependOp,
      .def("_prepend_op", &pd::BlockDesc::PrependOp,
           pybind11::return_value_policy::reference)
      .def("insert_op", &pd::BlockDesc::InsertOp,
      .def("_insert_op", &pd::BlockDesc::InsertOp,
           pybind11::return_value_policy::reference)
      .def("remove_op", &pd::BlockDesc::RemoveOp)
      .def("_remove_op", &pd::BlockDesc::RemoveOp)
      .def("var",
           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
             std::string name = byte_name;
@@ -165,7 +165,7 @@ void BindBlockDesc(pybind11::module *m) {
             return self.HasVar(name);
           },
           pybind11::return_value_policy::reference)
      .def("rename_var",
      .def("_rename_var",
           [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
              const pybind11::bytes &byte_name_new) {
             std::string name = byte_name;
@@ -189,7 +189,7 @@ void BindBlockDesc(pybind11::module *m) {
             return self.FindVarRecursive(name);
           },
           pybind11::return_value_policy::reference)
      .def("remove_var",
      .def("_remove_var",
           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
             std::string name = byte_name;
             return self.RemoveVar(name);
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <map>
#include <memory>
#include <mutex>  // NOLINT // for call_once
#include <string>
#include <unordered_map>
@@ -66,6 +67,14 @@ bool IsCompiledWithCUDA() {
#endif
}
bool IsCompiledWithDIST() {
#ifdef PADDLE_WITH_DISTRIBUTE
return true;
#else
return false;
#endif
}
PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");
@@ -78,37 +87,37 @@ PYBIND11_PLUGIN(core) {
  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
      .def_buffer(
          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
      .def("get_dims",
      .def("_get_dims",
           [](const Tensor &self) { return vectorize(self.dims()); })
      .def("set_dims",
      .def("_set_dims",
           [](Tensor &self, const std::vector<int64_t> &dim) {
             self.Resize(make_ddim(dim));
           })
      .def("set_layout",
      .def("_set_layout",
           [](Tensor &self, const std::string &layout) {
             self.set_layout(StringToDataLayout(layout));
           })
      .def("alloc_float",
      .def("_alloc_float",
           [](Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_float",
      .def("_alloc_float",
           [](Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_int",
      .def("_alloc_int",
           [](Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
      .def("_alloc_int",
           [](Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
      .def("_alloc_int",
           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_float",
      .def("_alloc_float",
           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
             self.mutable_data<float>(place);
           })
@@ -136,11 +145,11 @@ PYBIND11_PLUGIN(core) {
      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
#endif
      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
      .def("set_float_element", TensorSetElement<float>)
      .def("_set_float_element", TensorSetElement<float>)
      .def("get_float_element", TensorGetElement<float>)
      .def("_get_float_element", TensorGetElement<float>)
      .def("set_double_element", TensorSetElement<double>)
      .def("_set_double_element", TensorSetElement<double>)
      .def("get_double_element", TensorGetElement<double>)
      .def("_get_double_element", TensorGetElement<double>)
      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
      .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });
  py::class_<LoDTensor, Tensor>(m, "LoDTensor")
      .def_buffer(
@@ -302,7 +311,8 @@ All parameter, weight, gradient are variables in Paddle.
          ::paddle::operators::reader::LoDTensorBlockingQueue;
      using LoDTensorBlockingQueueHolder =
          ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
  py::class_<LoDTensorBlockingQueue>(m, "LoDTensorBlockingQueue", "")
  py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
      m, "LoDTensorBlockingQueue", "")
      .def("push",
           [](LoDTensorBlockingQueue &self,
              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
@@ -317,7 +327,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_lod_tensor_blocking_queue",
        [](Variable &var, size_t capacity,
           const std::vector<std::vector<int64_t>> &shapes)
            -> LoDTensorBlockingQueue * {
            -> std::shared_ptr<LoDTensorBlockingQueue> {
          std::vector<DDim> dims(shapes.size());
          std::transform(shapes.begin(), shapes.end(), dims.begin(),
                         [](const std::vector<int64_t> &shape) {
@@ -325,9 +335,9 @@ All parameter, weight, gradient are variables in Paddle.
                         });
          auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
          holder->InitOnce(capacity, dims);
          return holder->GetQueue().get();
          return holder->GetQueue();
        },
        py::return_value_policy::reference);
        py::return_value_policy::copy);
  py::class_<Scope>(m, "Scope", "")
      .def("var",
@@ -508,6 +518,7 @@ All parameter, weight, gradient are variables in Paddle.
        [](bool init_p2p) { framework::InitDevices(init_p2p); });
  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
  m.def("is_compiled_with_dist", IsCompiledWithDIST);
#ifdef PADDLE_WITH_CUDA
  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
    // Only GPUs with Compute Capability >= 53 support float16
@@ -534,6 +545,8 @@ All parameter, weight, gradient are variables in Paddle.
  });
  py::class_<LoDTensorArray>(m, "LoDTensorArray")
      .def("__init__",
           [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); })
      .def("__getitem__",
           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
           py::return_value_policy::reference)
@@ -656,7 +669,7 @@ All parameter, weight, gradient are variables in Paddle.
                    const std::string &, Scope *, std::vector<Scope *> &,
                    const ExecutionStrategy &, const BuildStrategy &, size_t,
                    size_t>())
      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
      .def("bcast_params", &ParallelExecutor::BCastParamsToDevices)
      // NOTE: even though we return a vec<Scope*>* to Python with reference
      // policy, we still cannot get a local_scope from this vector, since the
      // elements of vec<Scope*> will be freed by the Python GC. We can only
      // return Scope*
...
@@ -66,6 +66,17 @@ paddle_error paddle_arguments_get_value(paddle_arguments args,
  return kPD_NO_ERROR;
}
PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
uint64_t ID,
paddle_matrix mat) {
if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
auto a = castArg(args);
if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
m->mat = a->args[ID].in;
return kPD_NO_ERROR;
}
paddle_error paddle_arguments_get_ids(paddle_arguments args,
                                      uint64_t ID,
                                      paddle_ivector ids) {
...
@@ -87,6 +87,18 @@ PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
                                               uint64_t ID,
                                               paddle_matrix mat);
/**
 * @brief paddle_arguments_get_prob Get the probability matrix of beam search,
 * whose slot ID is `ID`
* @param [in] args arguments array
* @param [in] ID array index
* @param [out] mat matrix pointer
* @return paddle_error
*/
PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
uint64_t ID,
paddle_matrix mat);
/**
 * @brief PDArgsGetIds Get the integer vector of one argument in the array,
 * whose index is `ID`.
...
@@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName,
                           const std::string& funcName,
                           const std::vector<std::string>& args) {
  PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args);
#if PY_MAJOR_VERSION >= 3
Py_ssize_t str_size = 0u;
const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size);
return std::string(str, (size_t)str_size);
#else
return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
#endif // PY_MAJOR_VERSION >= 3
} }
PyObjectPtr createPythonClass( PyObjectPtr createPythonClass(
......
...@@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName, ...@@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
namespace py { namespace py {
PyObjectPtr import(const std::string& moduleName); PyObjectPtr import(const std::string& moduleName);
#if PY_MAJOR_VERSION >= 3
/**
* Cast a PyLong to int type T.
* @tparam T return type.
* @param [in] obj PyLong object.
 * @param [out] ok status of the cast. False if an error occurred; pass
 * nullptr if the caller does not care about the status.
* @return The value of python object, or 0 if not ok.
*/
template <typename T>
T castInt(PyObject* obj, bool* ok = nullptr) {
// Refer to https://www.python.org/dev/peps/pep-0237/: the int and long types
// were unified into a single long type in Python 3.
if (PyLong_Check(obj)) {
if (ok) *ok = true;
return (T)PyLong_AsUnsignedLong(obj);
} else {
if (ok) *ok = false;
return (T)0;
}
}
// Convert PyAPI from 2.x to 3.x
#define PyString_FromString PyUnicode_FromString
#define PyString_AsString PyUnicode_AsUTF8
#else
/** /**
* Cast a PyLong or PyInt to int type T. * Cast a PyLong or PyInt to int type T.
* @tparam T return type. * @tparam T return type.
...@@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) { ...@@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) {
return (T)0; return (T)0;
} }
} }
#endif // PY_MAJOR_VERSION >= 3
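The Python 3 branch above relies on PEP 237, which merged int and long into one unbounded integer type; this is easy to confirm from Python itself:
.. code-block:: python

    # Python 3: a single unbounded int type, so PyLong_Check covers all ints
    assert isinstance(2 ** 64, int)
    assert type(1) is type(2 ** 64)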
/** /**
* Invoke repr of python object. * Invoke repr of python object.
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
# Utils # Utils
#================================================= #=================================================
set -ex
function print_usage() { function print_usage() {
echo -e "\n${RED}Usage${NONE}: echo -e "\n${RED}Usage${NONE}:
${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
...@@ -37,6 +39,7 @@ function print_usage() { ...@@ -37,6 +39,7 @@ function print_usage() {
${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
${BLUE}check_style${NONE}: run code style check ${BLUE}check_style${NONE}: run code style check
${BLUE}cicheck${NONE}: run CI tasks ${BLUE}cicheck${NONE}: run CI tasks
${BLUE}assert_api_not_changed${NONE}: check API compatibility
" "
} }
...@@ -78,6 +81,12 @@ function cmake_gen() { ...@@ -78,6 +81,12 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
elif [ "$1" == "cp35-cp35m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH}
export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH}
export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
fi fi
fi fi
...@@ -108,6 +117,7 @@ function cmake_gen() { ...@@ -108,6 +117,7 @@ function cmake_gen() {
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
-DPY_VERSION=${PY_VERSION:-2.7}
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
...@@ -136,7 +146,8 @@ EOF ...@@ -136,7 +146,8 @@ EOF
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \
-DPY_VERSION=${PY_VERSION:-2.7}
} }
function abort(){ function abort(){
...@@ -318,11 +329,22 @@ function assert_api_not_changed() { ...@@ -318,11 +329,22 @@ function assert_api_not_changed() {
virtualenv .env virtualenv .env
source .env/bin/activate source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl pip install ${PADDLE_ROOT}/build/python/dist/*whl
curl ${PADDLE_API_SPEC_URL:-https://raw.githubusercontent.com/PaddlePaddle/FluidAPISpec/master/API.spec} \
> origin.spec
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate deactivate
API_CHANGE=`git diff --name-only HEAD^ | grep "paddle/fluid/API.spec" || true`
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# TODO: curl -H 'Authorization: token ${TOKEN}'
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have at least 2 approvals for the api change!"
exit 1
fi
fi
} }
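For reference, a hypothetical sketch of what tools/check_pr_approval.py could look like, given how it is invoked above (a required approval count plus reviewer user ids, with the GitHub reviews JSON piped to stdin); the real script's logic may differ:
.. code-block:: python

    import json
    import sys

    def check(required, reviewer_ids, reviews):
        # collect the user ids of reviews in the APPROVED state
        approved = set(
            str(r["user"]["id"]) for r in reviews
            if r.get("state") == "APPROVED")
        return len(approved & set(reviewer_ids)) >= required

    if __name__ == "__main__":
        required = int(sys.argv[1])
        result = check(required, sys.argv[2:], json.load(sys.stdin))
        print("TRUE" if result else "FALSE")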
...@@ -508,15 +530,27 @@ function gen_fluid_inference_lib() { ...@@ -508,15 +530,27 @@ function gen_fluid_inference_lib() {
Deploying fluid inference library ... Deploying fluid inference library ...
======================================== ========================================
EOF EOF
cmake .. -DWITH_DISTRIBUTE=OFF
make -j `nproc` inference_lib_dist make -j `nproc` inference_lib_dist
cd ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build
mv fluid_install_dir fluid cp -r fluid_install_dir fluid
tar -cf fluid.tgz fluid tar -cf fluid.tgz fluid
fi fi
} }
function test_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Testing fluid inference library ...
========================================
EOF
cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
fi
}
function main() { function main() {
set -e
local CMD=$1 local CMD=$1
init init
case $CMD in case $CMD in
...@@ -557,6 +591,7 @@ function main() { ...@@ -557,6 +591,7 @@ function main() {
fluid_inference_lib) fluid_inference_lib)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
gen_fluid_inference_lib gen_fluid_inference_lib
test_fluid_inference_lib
;; ;;
check_style) check_style)
check_style check_style
...@@ -568,6 +603,7 @@ function main() { ...@@ -568,6 +603,7 @@ function main() {
run_test run_test
gen_capi_package gen_capi_package
gen_fluid_inference_lib gen_fluid_inference_lib
test_fluid_inference_lib
;; ;;
*) *)
print_usage print_usage
......
...@@ -92,8 +92,15 @@ install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} ...@@ -92,8 +92,15 @@ install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
) )
find_program(PATCHELF_EXECUTABLE patchelf) if(APPLE)
if(NOT PATCHELF_EXECUTABLE) find_program(INSTALL_NAME_TOOL_EXECUTABLE install_name_tool)
if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
message(FATAL_ERROR "install_name_tool not found, please check.\n")
endif()
else(APPLE)
find_program(PATCHELF_EXECUTABLE patchelf)
if(NOT PATCHELF_EXECUTABLE)
message(FATAL_ERROR "patchelf not found, please install it.\n" message(FATAL_ERROR "patchelf not found, please install it.\n"
"For Ubuntu, the command is: apt-get install -y patchelf.") "For Ubuntu, the command is: apt-get install -y patchelf.")
endif() endif()
endif(APPLE)
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try: try:
from version import full_version as __version__ from paddle.version import full_version as __version__
from version import commit as __git_commit__ from paddle.version import commit as __git_commit__
except ImportError: except ImportError:
import sys import sys
...@@ -21,7 +21,7 @@ except ImportError: ...@@ -21,7 +21,7 @@ except ImportError:
import paddle from the source directory; please install paddlepaddle*.whl firstly.''' import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
) )
import reader import paddle.reader
import dataset import paddle.dataset
import batch import paddle.batch
batch = batch.batch batch = batch.batch
...@@ -15,20 +15,20 @@ ...@@ -15,20 +15,20 @@
Dataset package. Dataset package.
""" """
import mnist import paddle.dataset.mnist
import imikolov import paddle.dataset.imikolov
import imdb import paddle.dataset.imdb
import cifar import paddle.dataset.cifar
import movielens import paddle.dataset.movielens
import conll05 import paddle.dataset.conll05
import uci_housing import paddle.dataset.uci_housing
import sentiment import paddle.dataset.sentiment
import wmt14 import paddle.dataset.wmt14
import wmt16 import paddle.dataset.wmt16
import mq2007 import paddle.dataset.mq2007
import flowers import paddle.dataset.flowers
import voc2012 import paddle.dataset.voc2012
import image import paddle.dataset.image
__all__ = [ __all__ = [
'mnist', 'mnist',
......
...@@ -44,9 +44,9 @@ import metrics ...@@ -44,9 +44,9 @@ import metrics
import transpiler import transpiler
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \ from transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory memory_optimize, release_memory, DistributeTranspilerConfig
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select) channel_close, Select)
from lod_tensor import create_lod_tensor, create_random_int_lodtensor from lod_tensor import create_lod_tensor, create_random_int_lodtensor
...@@ -56,6 +56,7 @@ import unique_name ...@@ -56,6 +56,7 @@ import unique_name
import recordio_writer import recordio_writer
import parallel_executor import parallel_executor
from parallel_executor import * from parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor Tensor = LoDTensor
...@@ -65,13 +66,14 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ ...@@ -65,13 +66,14 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
'io', 'io',
'initializer', 'initializer',
'layers', 'layers',
'transpiler' 'transpiler',
'nets', 'nets',
'optimizer', 'optimizer',
'learning_rate_decay', 'learning_rate_decay',
'backward', 'backward',
'regularizer', 'regularizer',
'LoDTensor', 'LoDTensor',
'LoDTensorArray',
'CPUPlace', 'CPUPlace',
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace', 'CUDAPinnedPlace',
...@@ -121,6 +123,9 @@ def __bootstrap__(): ...@@ -121,6 +123,9 @@ def __bootstrap__():
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem' 'init_allocated_mem'
] ]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic' 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
...@@ -134,5 +139,5 @@ def __bootstrap__(): ...@@ -134,5 +139,5 @@ def __bootstrap__():
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py. # TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
# Consider paddle.init(args) or paddle.main(args) # Consider paddle.init(args) or paddle.main(args)
layers.monkey_patch_variable() monkey_patch_variable()
__bootstrap__() __bootstrap__()
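The monkey_patch_variable import above replaces the old layers.monkey_patch_variable() call at the bottom of the file. As a generic sketch of the pattern (not Paddle's actual implementation), monkey patching attaches operator methods to a class after its definition so that expressions such as a + b can build graph ops:
.. code-block:: python

    # generic monkey-patching pattern, illustrative only
    class Var(object):
        def __init__(self, value):
            self.value = value

    def monkey_patch_var():
        def __add__(self, other):
            # in fluid this would append an elementwise op to the block
            return Var(self.value + other.value)
        Var.__add__ = __add__

    monkey_patch_var()
    print((Var(1) + Var(2)).value)  # 3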
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -11,6 +11,28 @@ ...@@ -11,6 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
#
add_subdirectory(inference) import functools
import sys
__all__ = ['deprecated']
def deprecated(since, instead, extra_message=""):
def decorator(func):
err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
func.__name__, since, instead)
if len(extra_message) != 0:
err_msg += "\n"
err_msg += extra_message
@functools.wraps(func)
def wrapper(*args, **kwargs):
print >> sys.stderr, err_msg
return func(*args, **kwargs)
if wrapper.__doc__ is None:
    # functools.wraps leaves __doc__ as None when func has no docstring
    wrapper.__doc__ = ""
wrapper.__doc__ += "\n    "
wrapper.__doc__ += err_msg
return wrapper
return decorator
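An illustrative use of the decorator defined above (the version string and replacement API name are made-up examples):
.. code-block:: python

    @deprecated(since="0.14.0", instead="some_new_api")
    def some_old_api():
        pass

    some_old_api()
    # stderr: API some_old_api is deprecated since 0.14.0.
    #         Please use some_new_api instead.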
...@@ -18,10 +18,7 @@ import collections ...@@ -18,10 +18,7 @@ import collections
import copy import copy
import unique_name import unique_name
__all__ = [ __all__ = ['append_backward']
'append_backward',
'calc_gradient',
]
def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
...@@ -123,7 +120,8 @@ def _append_grad_suffix_(name): ...@@ -123,7 +120,8 @@ def _append_grad_suffix_(name):
def _addup_repetitive_outputs_(op_descs): def _addup_repetitive_outputs_(op_descs):
""" """
In the backward pass, a variable may be the output of more than one op. In the backward pass, a variable may be the output of more than one op.
In this case, the variable should be the accumulation of all the outputs. And one op may yield multiple outputs to the same variable.
In these cases, the variable should be the accumulation of all those outputs.
`sum_op`s are added to implement the accumulation. `sum_op`s are added to implement the accumulation.
""" """
pending_sum_ops = [] pending_sum_ops = []
...@@ -136,7 +134,9 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -136,7 +134,9 @@ def _addup_repetitive_outputs_(op_descs):
"sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
{"use_mkldnn": False}), idx)) {"use_mkldnn": False}), idx))
renamed_vars[var_name] = [var_name] renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names(): for param_idx, param_name in enumerate(op_desc.output_names()):
arg_names = op_desc.output(param_name)
for arg_idx, var_name in enumerate(arg_names):
if var_name == core.empty_var_name( if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names(): ) or var_name in op_desc.input_arg_names():
# empty variable or inplace op # empty variable or inplace op
...@@ -154,11 +154,26 @@ def _addup_repetitive_outputs_(op_descs): ...@@ -154,11 +154,26 @@ def _addup_repetitive_outputs_(op_descs):
_rename_arg_(op_descs, var_name, new_name, 0, idx) _rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name) _rename_arg_(pending_sum_ops, var_name, new_name)
for p in op_desc.output_names()[:param_idx]:
p_arg_names = op_desc.output(p)
if var_name in p_arg_names:
op_desc.set_output(p, [
new_name if x == var_name else x
for x in p_arg_names
])
arg_names = [
new_name if x == var_name else x
for x in arg_names[:arg_idx]
] + arg_names[arg_idx:]
new_name = var_name + "@RENAME@" + \ new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name]) str(var_rename_count[var_name])
var_rename_count[var_name] += 1 var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name) arg_names[arg_idx] = new_name
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name) renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems(): for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1: if len(inputs) > 1:
pending_sum_ops.append( pending_sum_ops.append(
...@@ -313,7 +328,7 @@ def _append_backward_ops_(block, ...@@ -313,7 +328,7 @@ def _append_backward_ops_(block,
if op.has_attr("sub_block"): if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block")) sub_block = program.block(op.block_attr("sub_block"))
grad_sub_block = program.create_block() grad_sub_block = program.create_block()
grad_sub_block.set_forward_block_idx(sub_block.idx) grad_sub_block._set_forward_block_idx(sub_block.idx)
cb = _callback_lookup_(op) cb = _callback_lookup_(op)
if cb is not None: if cb is not None:
if callbacks is None: if callbacks is None:
...@@ -556,7 +571,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -556,7 +571,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx program.current_block_idx = current_block_idx
program.sync_with_cpp() program._sync_with_cpp()
# FIXME(zcd): prevent loss.grad optimized by mem_opt. # FIXME(zcd): prevent loss.grad optimized by mem_opt.
loss.block.var(_append_grad_suffix_(loss.name)).persistable = True loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
...@@ -729,7 +744,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): ...@@ -729,7 +744,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
_rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
_append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
prog.sync_with_cpp() prog._sync_with_cpp()
grad_vars = [] grad_vars = []
for input_var in inputs: for input_var in inputs:
......
...@@ -31,7 +31,7 @@ class BaseErrorClipAttr(object): ...@@ -31,7 +31,7 @@ class BaseErrorClipAttr(object):
def __str__(self): def __str__(self):
raise NotImplementedError() raise NotImplementedError()
def append_clip_op(self, block, grad_name): def _append_clip_op(self, block, grad_name):
raise NotImplementedError() raise NotImplementedError()
...@@ -67,7 +67,7 @@ class ErrorClipByValue(BaseErrorClipAttr): ...@@ -67,7 +67,7 @@ class ErrorClipByValue(BaseErrorClipAttr):
def __str__(self): def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max) return "ByValue, min=%f, max=%f" % (self.min, self.max)
def append_clip_op(self, block, grad_name): def _append_clip_op(self, block, grad_name):
clip_op_desc = block.desc.append_op() clip_op_desc = block.desc.append_op()
clip_op_desc.set_type("clip") clip_op_desc.set_type("clip")
clip_op_desc.set_input("X", [grad_name]) clip_op_desc.set_input("X", [grad_name])
...@@ -82,7 +82,7 @@ def error_clip_callback(block, context): ...@@ -82,7 +82,7 @@ def error_clip_callback(block, context):
op_desc = block.desc.op(block.desc.op_size() - 1) op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n), for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()): op_desc.output_arg_names()):
fwd_var = block.var_recursive(grad_to_var[grad_n]) fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None) error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip, if not (error_clip is None or isinstance(error_clip,
BaseErrorClipAttr)): BaseErrorClipAttr)):
...@@ -90,17 +90,17 @@ def error_clip_callback(block, context): ...@@ -90,17 +90,17 @@ def error_clip_callback(block, context):
"Variable's error_clip should be an instance of BaseErrorClipAttr or None." "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
) )
if error_clip is not None: if error_clip is not None:
error_clip.append_clip_op(block, grad_n) error_clip._append_clip_op(block, grad_n)
class BaseGradientClipAttr(object): class BaseGradientClipAttr(object):
def __str__(self): def __str__(self):
raise NotImplementedError() raise NotImplementedError()
def process_context(self, context, param, grad): def _process_context(self, context, param, grad):
raise NotImplementedError() raise NotImplementedError()
def create_operators(self, param, grad): def _create_operators(self, param, grad):
raise NotImplementedError() raise NotImplementedError()
...@@ -108,10 +108,10 @@ class NullGradientClipAttr(BaseGradientClipAttr): ...@@ -108,10 +108,10 @@ class NullGradientClipAttr(BaseGradientClipAttr):
def __str__(self): def __str__(self):
return "Null" return "Null"
def process_context(self, context, param, grad): def _process_context(self, context, param, grad):
pass pass
def create_operators(self, param, grad): def _create_operators(self, param, grad):
return param, grad return param, grad
...@@ -153,10 +153,10 @@ class GradientClipByValue(BaseGradientClipAttr): ...@@ -153,10 +153,10 @@ class GradientClipByValue(BaseGradientClipAttr):
def __str__(self): def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max) return "ByValue, min=%f, max=%f" % (self.min, self.max)
def process_context(self, context, param, grad): def _process_context(self, context, param, grad):
pass pass
def create_operators(self, param, grad): def _create_operators(self, param, grad):
new_grad = layers.clip(x=grad, min=self.min, max=self.max) new_grad = layers.clip(x=grad, min=self.min, max=self.max)
return param, new_grad return param, new_grad
...@@ -199,10 +199,10 @@ class GradientClipByNorm(BaseGradientClipAttr): ...@@ -199,10 +199,10 @@ class GradientClipByNorm(BaseGradientClipAttr):
def __str__(self): def __str__(self):
return "ByNorm, clip_norm=%f" % self.clip_norm return "ByNorm, clip_norm=%f" % self.clip_norm
def process_context(self, context, param, grad): def _process_context(self, context, param, grad):
pass pass
def create_operators(self, param, grad): def _create_operators(self, param, grad):
new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm) new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
return param, new_grad return param, new_grad
...@@ -257,7 +257,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -257,7 +257,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name, return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
self.clip_norm) self.clip_norm)
def process_context(self, context, param, grad): def _process_context(self, context, param, grad):
if self.group_name not in context: if self.group_name not in context:
context[self.group_name] = [] context[self.group_name] = []
context[self.group_name + "_clip_value"] = self.clip_norm context[self.group_name + "_clip_value"] = self.clip_norm
...@@ -274,7 +274,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -274,7 +274,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
self.context = context self.context = context
def create_operators(self, param, grad): def _create_operators(self, param, grad):
group_scale_name = self.group_name + "_scale" group_scale_name = self.group_name + "_scale"
if group_scale_name not in self.context: if group_scale_name not in self.context:
group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sums(input=self.context[self.group_name])
...@@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None): ...@@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None):
param.gradient_clip_attr = copy.deepcopy(clip) param.gradient_clip_attr = copy.deepcopy(clip)
def append_gradient_clip_ops(param_grad): def append_gradient_clip_ops(param_grads):
context = dict() context = dict()
for p, g in param_grad: for p, g in param_grads:
with p.block.program.optimized_guard(p): if g is None:
continue
with p.block.program.optimized_guard([p, g]):
clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
if clip_attr is None: if clip_attr is None:
clip_attr = NullGradientClipAttr() clip_attr = NullGradientClipAttr()
...@@ -336,12 +338,14 @@ def append_gradient_clip_ops(param_grad): ...@@ -336,12 +338,14 @@ def append_gradient_clip_ops(param_grad):
"clip attribute should be an instance of BaseGradientClipAttr" "clip attribute should be an instance of BaseGradientClipAttr"
) )
clip_attr.process_context(context=context, param=p, grad=g) clip_attr._process_context(context=context, param=p, grad=g)
res = [] res = []
for p, g in param_grad: for p, g in param_grads:
with p.block.program.optimized_guard(p): if g is None:
res.append(clip_attr.create_operators(param=p, grad=g)) continue
with p.block.program.optimized_guard([p, g]):
res.append(clip_attr._create_operators(param=p, grad=g))
return res return res
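A hedged usage sketch of the clipping classes in this file: set_gradient_clip (shown above) installs a clip attribute on parameters, and append_gradient_clip_ops then calls _process_context and _create_operators for each (param, grad) pair.
.. code-block:: python

    import paddle.fluid as fluid

    # clip all parameter gradients to a global norm of 5.0
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))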
......
...@@ -69,8 +69,10 @@ class Go(BlockGuard): ...@@ -69,8 +69,10 @@ class Go(BlockGuard):
parent_block.append_op( parent_block.append_op(
type='go', type='go',
inputs={ inputs={
'X': 'X': [
[parent_block.var_recursive(x_name) for x_name in x_name_list] parent_block._var_recursive(x_name)
for x_name in x_name_list
]
}, },
outputs={}, outputs={},
attrs={'sub_block': go_block}) attrs={'sub_block': go_block})
...@@ -259,7 +261,7 @@ class Select(BlockGuard): ...@@ -259,7 +261,7 @@ class Select(BlockGuard):
if var_name in intermediate if var_name in intermediate
] ]
X = [select_block.var_recursive(x_name) for x_name in params] X = [select_block._var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block. # Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute) X.append(self.case_to_execute)
......
...@@ -309,7 +309,7 @@ class Executor(object): ...@@ -309,7 +309,7 @@ class Executor(object):
if not has_feed_operators(global_block, feed, feed_var_name): if not has_feed_operators(global_block, feed, feed_var_name):
for i, name in enumerate(feed): for i, name in enumerate(feed):
out = global_block.var(name) out = global_block.var(name)
global_block.prepend_op( global_block._prepend_op(
type='feed', type='feed',
inputs={'X': [feed_var]}, inputs={'X': [feed_var]},
outputs={'Out': [out]}, outputs={'Out': [out]},
......
...@@ -32,7 +32,6 @@ except Exception, e: ...@@ -32,7 +32,6 @@ except Exception, e:
import unique_name import unique_name
__all__ = [ __all__ = [
'Block',
'Variable', 'Variable',
'Program', 'Program',
'Operator', 'Operator',
...@@ -447,7 +446,7 @@ class Operator(object): ...@@ -447,7 +446,7 @@ class Operator(object):
Notes: Notes:
The constructor of operator should not be invoked directly. Use The constructor of operator should not be invoked directly. Use
Block.append_op or Block.prepend_op instead. Block.append_op or Block._prepend_op instead.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -870,7 +869,7 @@ class Block(object): ...@@ -870,7 +869,7 @@ class Block(object):
def forward_block_idx(self): def forward_block_idx(self):
return self.desc.get_forward_block_idx() return self.desc.get_forward_block_idx()
def set_forward_block_idx(self, idx): def _set_forward_block_idx(self, idx):
""" """
Set the forward block Idx. Set the forward block Idx.
...@@ -880,7 +879,7 @@ class Block(object): ...@@ -880,7 +879,7 @@ class Block(object):
Returns: Returns:
None None
""" """
self.desc.set_forward_block_idx(idx) self.desc._set_forward_block_idx(idx)
@property @property
def idx(self): def idx(self):
...@@ -909,7 +908,7 @@ class Block(object): ...@@ -909,7 +908,7 @@ class Block(object):
raise ValueError("var %s not in this block" % name) raise ValueError("var %s not in this block" % name)
return v return v
def var_recursive(self, name): def _var_recursive(self, name):
""" """
Get a Variable by name from this block recursively. Get a Variable by name from this block recursively.
...@@ -951,9 +950,9 @@ class Block(object): ...@@ -951,9 +950,9 @@ class Block(object):
raise ValueError("Var {0} is not found recursively".format(name)) raise ValueError("Var {0} is not found recursively".format(name))
def all_parameters(self): def all_parameters(self):
return list(self.iter_parameters()) return list(self._iter_parameters())
def iter_parameters(self): def _iter_parameters(self):
return (item[1] for item in self.vars.iteritems() return (item[1] for item in self.vars.iteritems()
if isinstance(item[1], Parameter)) if isinstance(item[1], Parameter))
...@@ -966,7 +965,7 @@ class Block(object): ...@@ -966,7 +965,7 @@ class Block(object):
def has_var(self, name): def has_var(self, name):
return name in self.vars return name in self.vars
def rename_var(self, name, new_name): def _rename_var(self, name, new_name):
""" """
Rename variable in vars and ops' inputs and outputs Rename variable in vars and ops' inputs and outputs
...@@ -1000,8 +999,8 @@ class Block(object): ...@@ -1000,8 +999,8 @@ class Block(object):
else: else:
raise ValueError("unsupported var type: %s", type(v)) raise ValueError("unsupported var type: %s", type(v))
orig_var_type = v.type orig_var_type = v.type
self.desc.rename_var(name, new_name) self.desc._rename_var(name, new_name)
# NOTE: v is destroyed by C++ after calling rename_var. # NOTE: v is destroyed by C++ after calling _rename_var.
d = self.desc.find_var(new_name) d = self.desc.find_var(new_name)
if var_type == "Parameter": if var_type == "Parameter":
var = Parameter( var = Parameter(
...@@ -1024,16 +1023,16 @@ class Block(object): ...@@ -1024,16 +1023,16 @@ class Block(object):
error_clip=error_clip, error_clip=error_clip,
stop_gradient=stop_gradient) stop_gradient=stop_gradient)
# rename the python side, sync_with_cpp will only add # rename the python side, _sync_with_cpp will only add
# new vars/ops to python side. # new vars/ops to python side.
self.vars[new_name] = var self.vars[new_name] = var
del self.vars[name] del self.vars[name]
self.sync_with_cpp() self._sync_with_cpp()
return var return var
def remove_var(self, name): def _remove_var(self, name):
self.sync_with_cpp() self._sync_with_cpp()
self.desc.remove_var(name) self.desc._remove_var(name)
del self.vars[name] del self.vars[name]
def create_parameter(self, *args, **kwargs): def create_parameter(self, *args, **kwargs):
...@@ -1055,7 +1054,7 @@ class Block(object): ...@@ -1055,7 +1054,7 @@ class Block(object):
self.ops.append(op) self.ops.append(op)
return op return op
def insert_op(self, index, *args, **kwargs): def _insert_op(self, index, *args, **kwargs):
""" """
Insert a Operator according to the giving arguments. Insert a Operator according to the giving arguments.
...@@ -1065,13 +1064,13 @@ class Block(object): ...@@ -1065,13 +1064,13 @@ class Block(object):
Returns: Returns:
Operator: the inserted Operator. Operator: the inserted Operator.
""" """
self.sync_with_cpp() self._sync_with_cpp()
op_desc = self.desc.insert_op(index) op_desc = self.desc._insert_op(index)
op = Operator(block=self, desc=op_desc, *args, **kwargs) op = Operator(block=self, desc=op_desc, *args, **kwargs)
self.ops.insert(index, op) self.ops.insert(index, op)
return op return op
def remove_op(self, index): def _remove_op(self, index):
""" """
Remove the specific position operator. Remove the specific position operator.
...@@ -1081,11 +1080,11 @@ class Block(object): ...@@ -1081,11 +1080,11 @@ class Block(object):
Returns: Returns:
None None
""" """
self.sync_with_cpp() self._sync_with_cpp()
self.desc.remove_op(index, index + 1) self.desc._remove_op(index, index + 1)
del self.ops[index] del self.ops[index]
def slice_ops(self, start, end): def _slice_ops(self, start, end):
""" """
Return the Operator between start and end. Return the Operator between start and end.
...@@ -1098,13 +1097,13 @@ class Block(object): ...@@ -1098,13 +1097,13 @@ class Block(object):
""" """
return self.ops[start:end] return self.ops[start:end]
def prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc._prepend_op()
op = Operator(self, op_desc, *args, **kwargs) op = Operator(self, op_desc, *args, **kwargs)
self.ops.insert(0, op) self.ops.insert(0, op)
return op return op
def sync_with_cpp(self): def _sync_with_cpp(self):
""" """
Sync from the desc on the c++ end. This method is used to synchronize Sync from the desc on the c++ end. This method is used to synchronize
the c++ desc instance generated by backward. the c++ desc instance generated by backward.
...@@ -1170,7 +1169,7 @@ class Block(object): ...@@ -1170,7 +1169,7 @@ class Block(object):
for index in range(len(self.ops)): for index in range(len(self.ops)):
assert self.ops[index].desc == ops_in_cpp[index] assert self.ops[index].desc == ops_in_cpp[index]
def copy_param_info_from(self, other): def _copy_param_info_from(self, other):
""" """
Copy the information of parameters from the other block. Copy the information of parameters from the other block.
...@@ -1185,12 +1184,13 @@ class Block(object): ...@@ -1185,12 +1184,13 @@ class Block(object):
None None
""" """
if not isinstance(other, Block): if not isinstance(other, Block):
raise TypeError("copy_param_info_from should be invoked with Block") raise TypeError(
for p in other.iter_parameters(): "_copy_param_info_from should be invoked with Block")
for p in other._iter_parameters():
assert isinstance(p, Parameter) assert isinstance(p, Parameter)
v = self.vars.get(p.name, None) v = self.vars.get(p.name, None)
if v is None: if v is None:
raise ValueError("copy_param_info_from should be invoked with " raise ValueError("_copy_param_info_from should be invoked with "
"same topology") "same topology")
assert isinstance(v, Variable) assert isinstance(v, Variable)
new_p = Parameter( new_p = Parameter(
...@@ -1208,7 +1208,7 @@ class Block(object): ...@@ -1208,7 +1208,7 @@ class Block(object):
name=v.name) name=v.name)
self.vars[new_p.name] = new_p self.vars[new_p.name] = new_p
def clone_variable(self, var): def _clone_variable(self, var):
""" """
Clone a variable into current block. Clone a variable into current block.
...@@ -1319,7 +1319,7 @@ class Program(object): ...@@ -1319,7 +1319,7 @@ class Program(object):
self._op_role_var = [var_name] self._op_role_var = [var_name]
@contextlib.contextmanager @contextlib.contextmanager
def optimized_guard(self, var): def optimized_guard(self, param_and_grads):
""" """
A with guard to set :code:`Optimization` :code:`OpRole` and A with guard to set :code:`Optimization` :code:`OpRole` and
:code:`OpRoleVar` automatically. :code:`OpRoleVar` automatically.
...@@ -1327,17 +1327,20 @@ class Program(object): ...@@ -1327,17 +1327,20 @@ class Program(object):
Notes: This is a very low level API. Users should not use it directly. Notes: This is a very low level API. Users should not use it directly.
Args: Args:
var(Variable|str): The variable (name) to be optimized. param_and_grads(list): The variables (names) to be optimized.
Examples: Examples:
>>> p, g = backward(...) >>> p, g = backward(...)
>>> with program.optimized_guard(p): >>> with program.optimized_guard([p,g]):
>>> p = p - 0.001 * g >>> p = p - 0.001 * g
""" """
OpRole = core.op_proto_and_checker_maker.OpRole OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.Optimize self._current_role = OpRole.Optimize
self._op_role_var = [var.name if isinstance(var, Variable) else var] self._op_role_var = [
var.name if isinstance(var, Variable) else var
for var in param_and_grads
]
yield yield
self._op_role_var = [] self._op_role_var = []
self._current_role = OpRole.Forward self._current_role = OpRole.Forward
...@@ -1481,9 +1484,9 @@ class Program(object): ...@@ -1481,9 +1484,9 @@ class Program(object):
p = Program() p = Program()
p.desc = core.ProgramDesc(self.desc) p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p.sync_with_cpp() p._sync_with_cpp()
p.copy_param_info_from(self) p._copy_param_info_from(self)
p.copy_data_info_from(self) p.copy_data_info_from(self)
return p return p
...@@ -1533,7 +1536,7 @@ class Program(object): ...@@ -1533,7 +1536,7 @@ class Program(object):
res = Program() res = Program()
res.desc = core.prune(self.desc, targets_idx) res.desc = core.prune(self.desc, targets_idx)
res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
res.sync_with_cpp() res._sync_with_cpp()
return res return res
def inference_optimize(self): def inference_optimize(self):
...@@ -1559,7 +1562,7 @@ class Program(object): ...@@ -1559,7 +1562,7 @@ class Program(object):
if op.has_attr('is_test'): if op.has_attr('is_test'):
op.set_attr('is_test', True) op.set_attr('is_test', True)
res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())] res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
res.sync_with_cpp() res._sync_with_cpp()
return res return res
@staticmethod @staticmethod
...@@ -1579,7 +1582,7 @@ class Program(object): ...@@ -1579,7 +1582,7 @@ class Program(object):
p = Program() p = Program()
p.desc = core.ProgramDesc(binary_str) p.desc = core.ProgramDesc(binary_str)
p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
p.sync_with_cpp() p._sync_with_cpp()
return p return p
@property @property
...@@ -1659,7 +1662,7 @@ class Program(object): ...@@ -1659,7 +1662,7 @@ class Program(object):
""" """
self.current_block_idx = self.current_block().parent_idx self.current_block_idx = self.current_block().parent_idx
def sync_with_cpp(self): def _sync_with_cpp(self):
""" """
Synchronize Python instance to its binding C++ object instance. Synchronize Python instance to its binding C++ object instance.
If the program is modified in C++ space, this method should be invoked. If the program is modified in C++ space, this method should be invoked.
...@@ -1673,9 +1676,9 @@ class Program(object): ...@@ -1673,9 +1676,9 @@ class Program(object):
for block_idx in range(len(self.blocks), self.desc.num_blocks()): for block_idx in range(len(self.blocks), self.desc.num_blocks()):
self.blocks.append(Block(self, block_idx)) self.blocks.append(Block(self, block_idx))
for block in self.blocks: for block in self.blocks:
block.sync_with_cpp() block._sync_with_cpp()
def copy_param_info_from(self, other): def _copy_param_info_from(self, other):
""" """
Copy the information of parameters from other program. Copy the information of parameters from other program.
...@@ -1689,13 +1692,13 @@ class Program(object): ...@@ -1689,13 +1692,13 @@ class Program(object):
None None
""" """
if not isinstance(other, Program): if not isinstance(other, Program):
raise TypeError("copy_param_info_from should be invoked with " raise TypeError("_copy_param_info_from should be invoked with "
"Program") "Program")
if len(self.blocks) != len(other.blocks): if len(self.blocks) != len(other.blocks):
raise ValueError("copy_param_info_from should be invoked with two " raise ValueError("_copy_param_info_from should be invoked with two "
"program, with represent the same topology") "program, with represent the same topology")
self.global_block().copy_param_info_from(other.global_block()) self.global_block()._copy_param_info_from(other.global_block())
def copy_data_info_from(self, other): def copy_data_info_from(self, other):
""" """
...@@ -1711,11 +1714,11 @@ class Program(object): ...@@ -1711,11 +1714,11 @@ class Program(object):
None None
""" """
if not isinstance(other, Program): if not isinstance(other, Program):
raise TypeError("copy_param_info_from should be invoked with " raise TypeError("_copy_param_info_from should be invoked with "
"Program") "Program")
if len(self.blocks) != len(other.blocks): if len(self.blocks) != len(other.blocks):
raise ValueError("copy_param_info_from should be invoked with two " raise ValueError("_copy_param_info_from should be invoked with two "
"program, with represent the same topology") "program, with represent the same topology")
for var in other.global_block().vars.itervalues(): for var in other.global_block().vars.itervalues():
if var.is_data: if var.is_data:
......
...@@ -148,7 +148,7 @@ class ConstantInitializer(Initializer): ...@@ -148,7 +148,7 @@ class ConstantInitializer(Initializer):
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
op = block.prepend_op( op = block._prepend_op(
type="fill_constant", type="fill_constant",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -202,7 +202,7 @@ class UniformInitializer(Initializer): ...@@ -202,7 +202,7 @@ class UniformInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
op = block.prepend_op( op = block._prepend_op(
type="uniform_random", type="uniform_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -256,7 +256,7 @@ class NormalInitializer(Initializer): ...@@ -256,7 +256,7 @@ class NormalInitializer(Initializer):
# Initialization Ops should be prepended and not appended # Initialization Ops should be prepended and not appended
if self._seed == 0: if self._seed == 0:
self._seed = block.program.random_seed self._seed = block.program.random_seed
op = block.prepend_op( op = block._prepend_op(
type="gaussian_random", type="gaussian_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -346,7 +346,7 @@ class XavierInitializer(Initializer): ...@@ -346,7 +346,7 @@ class XavierInitializer(Initializer):
if self._uniform: if self._uniform:
limit = np.sqrt(6.0 / float(fan_in + fan_out)) limit = np.sqrt(6.0 / float(fan_in + fan_out))
op = block.prepend_op( op = block._prepend_op(
type="uniform_random", type="uniform_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -359,7 +359,7 @@ class XavierInitializer(Initializer): ...@@ -359,7 +359,7 @@ class XavierInitializer(Initializer):
else: else:
std = np.sqrt(2.0 / float(fan_in + fan_out)) std = np.sqrt(2.0 / float(fan_in + fan_out))
op = block.prepend_op( op = block._prepend_op(
type="gaussian_random", type="gaussian_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -444,7 +444,7 @@ class MSRAInitializer(Initializer): ...@@ -444,7 +444,7 @@ class MSRAInitializer(Initializer):
if self._uniform: if self._uniform:
limit = np.sqrt(6.0 / float(fan_in)) limit = np.sqrt(6.0 / float(fan_in))
op = block.prepend_op( op = block._prepend_op(
type="uniform_random", type="uniform_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
...@@ -457,7 +457,7 @@ class MSRAInitializer(Initializer): ...@@ -457,7 +457,7 @@ class MSRAInitializer(Initializer):
else: else:
std = np.sqrt(2.0 / float(fan_in)) std = np.sqrt(2.0 / float(fan_in))
op = block.prepend_op( op = block._prepend_op(
type="gaussian_random", type="gaussian_random",
outputs={"Out": var}, outputs={"Out": var},
attrs={ attrs={
......
...@@ -24,10 +24,7 @@ from . import core ...@@ -24,10 +24,7 @@ from . import core
__all__ = [ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model', 'load_persistables', 'save_inference_model', 'load_inference_model',
'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'get_inference_program'
'clean_checkpoint', 'load_persist_vars_without_grad',
'load_lookup_table_vars', 'save_persist_vars_without_grad',
'get_latest_checkpoint_serial'
] ]
...@@ -526,7 +523,7 @@ def prepend_feed_ops(inference_program, ...@@ -526,7 +523,7 @@ def prepend_feed_ops(inference_program,
for i, name in enumerate(feed_target_names): for i, name in enumerate(feed_target_names):
out = global_block.var(name) out = global_block.var(name)
global_block.prepend_op( global_block._prepend_op(
type='feed', type='feed',
inputs={'X': [feed_var]}, inputs={'X': [feed_var]},
outputs={'Out': [out]}, outputs={'Out': [out]},
...@@ -628,7 +625,7 @@ def save_inference_model(dirname, ...@@ -628,7 +625,7 @@ def save_inference_model(dirname,
for i, op in enumerate(global_block.ops): for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False) op.desc.set_is_target(False)
if op.type == "feed" or op.type == "fetch": if op.type == "feed" or op.type == "fetch":
global_block.remove_op(i) global_block._remove_op(i)
copy_program.desc.flush() copy_program.desc.flush()
pruned_program = copy_program.prune(targets=target_vars) pruned_program = copy_program.prune(targets=target_vars)
...@@ -794,588 +791,6 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -794,588 +791,6 @@ def get_parameter_value_by_name(name, executor, program=None):
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
SUCCESS_MARK_FILENAME = "_SUCCESS"
CHECKPOINT_PREFIX = "checkpoint"
MODEL_DIR = "__model__"
LOOKUP_TABLE_DIR = "__lookup_table__"
TRAINER_PREFIX = "trainer"
CHECKPOINT_SEPARATOR = "_"
def save_checkpoint(executor,
checkpoint_dir,
trainer_id,
trainer_args=None,
main_program=None,
max_num_checkpoints=3,
lookup_table=None,
ps_endpoint_list=None):
"""
This function filters out all checkpoint variables from the given
main_program and then saves these variables to the `checkpoint_dir`
directory.
In the training process, we generally save a checkpoint in each
iteration, so there might be a lot of checkpoints in
`checkpoint_dir`. To keep them from taking up too much disk space,
`max_num_checkpoints` limits the total number of
checkpoints. If the number of existing checkpoints is greater than
`max_num_checkpoints`, the oldest ones will be deleted.
A variable is a checkpoint variable and will be saved if it meets
all of the following conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for save checkpoint.
checkpoint_dir(str): The folder where to save checkpoints.
trainer_id(int): current trainer id; if the id equals 0, the trainer
is chief.
trainer_args(dict|None): Current training arguments. Such as 'epoch_id'
and 'step_id'.
Default: None
main_program(Program|None): The program whose checkpoint variables will
be saved. If it is None, the default main program will be used.
max_num_checkpoints(int): The max number of total number of existing
checkpoints.
Default: 3
lookup_table(string|None): the lookup table name. When a distributed
lookup table is used, the name can be obtained from DistributeTranspiler.
table_name.
ps_endpoint_list(list|None): the parameter server ip:port list.
When a distributed lookup table is used, ps_endpoint_list can be
obtained from the distribute arguments.
Returns:
None
Raises:
ValueError: If `checkpoint_dir` is None.
AssertionError: If `trainer_args` is not a dict.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
trainer_args = {"epoch_id": 200,
"step_id": 20} # just an example
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
fluid.io.save_checkpoint(executor=exe,
checkpoint_dir=path,
trainer_id=0,
trainer_args=trainer_args,
main_program=prog,
max_num_checkpoints=3,
lookup_table=table_name,
ps_endpoint_list = ps_endpoints)
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
assert checkpoint_dir
if trainer_args:
assert isinstance(trainer_args, dict)
is_chief = trainer_id == 0
_make_checkpoint_dirs(checkpoint_dir)
serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
cur_dir = _get_serial_dir(checkpoint_dir, serial)
save_trainer_args(cur_dir, trainer_id, trainer_args)
if is_chief:
save_persist_vars_without_grad(executor, cur_dir, main_program)
if is_chief and lookup_table and ps_endpoint_list:
save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
ps_endpoint_list)
_scroll_delete(checkpoint_dir, max_num_checkpoints)
def load_checkpoint(executor, checkpoint_dir, serial, main_program):
"""
This function filters out all checkpoint variables from the given
main_program and then tries to load these variables from the
`checkpoint_dir` directory.
In the training process, we generally save a checkpoint in each
iteration, so there may be more than one checkpoint in
`checkpoint_dir` (each checkpoint has its own sub folder); use
`serial` to specify which checkpoint you would like to
load.
A variable is a checkpoint variable and will be loaded if it meets
all of the following conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for loading checkpoint.
checkpoint_dir(str): The folder where all checkpoints are.
serial(int): The serial of checkpoint you would like to load.
main_program(Program): The program whose checkpoint variables will
be loaded.
Returns:
None
Raises:
ValueError: If `checkpoint_dir` is None.
ValueError: If `serial` is None or `serial` is less than 0.
ValueError: If `main_program` is None.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
serial=9, main_program=prog)
# In this example, the `load_checkpoint` function
# first filters out all checkpoint variables in the default
# main program, and then tries to load these variables from the
# folder "./checkpoints/checkpoint_9/__model__".
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
if serial is None or serial < 0:
raise ValueError("'serial' should not be None or <0 ")
if main_program is None:
raise ValueError('main_program should not be None.')
cur_dir = _get_serial_dir(checkpoint_dir, serial)
load_persist_vars_without_grad(executor, cur_dir, main_program, True)
def clean_checkpoint(checkpoint_dir, delete_dir=False):
"""
Clean the checkpoint dir. When training exits normally,
the trainer calls clean_checkpoint to delete the checkpoint directory saved earlier.
delete_dir only works when the directory is empty; otherwise, an OSError is raised.
: param checkpoint_dir
: param delete_dir
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
_scroll_delete(checkpoint_dir, max_num_checkpoints=0)
if delete_dir and not os.listdir(checkpoint_dir):
os.rmdir(checkpoint_dir)
def load_persist_vars_without_grad(executor,
dirname,
program,
has_model_dir=False):
"""
This function filters out all checkpoint variables from the given
program and then tries to load these variables from the given directory.
A variable is a checkpoint variable if it meets all of the following
conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for loading variables.
dirname(str): The directory path.
program(Program): The program whose checkpoint variables will
be loaded.
has_model_dir(bool): if True, the function loads variables
from a sub directory named '__model__'.
Default: False
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog, has_model_dir=True)
# In this example, the `load_persist_vars_without_grad` function
# first filters out all checkpoint variables in the default
# main program, and then tries to load these variables from the
# folder "./my_paddle_model/__model__".
"""
if has_model_dir:
dirname = _get_model_dir(dirname)
load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
"""
The parameter server loads the lookup table's local file into a
SelectedRows variable.
Args:
executor(Executor): The executor to run for loading persistable variables
dirname(str): The directory path
main_program(Program): Find the variable named table_name in main_program
pserver_id(int): the serial number in pserver_endpoints list
table_name(str): lookup table name
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
dirname = "./checkpoints/checkpoint_9/__model__"
prog = fluid.default_main_program()
pserver_id = 1
table_name = "share_w"
fluid.io.load_lookup_table_vars(executor=exe,
dirname=dirname, program=prog, pserver_id=pserver_id,
table_name=table_name)
"""
for var in program.list_vars():
if var.name == table_name:
lookup_table_var = var
break
assert lookup_table_var is not None
lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
load_prog = Program()
load_block = load_prog.global_block()
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [lookup_table_var]},
attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
executor.run(load_prog)
def save_persist_vars_without_grad(executor, dirname, program):
"""
This function filters out all checkpoint variables from the given
program and then saves these variables to a sub-folder '__model__' of
the given directory.
A variable is a checkpoint variable if it meets all of the following
conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for saving variables.
dirname(str): The directory path.
program(Program): The program whose checkpoint variables will
be saved.
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog)
# In this example, the `save_persist_vars_without_grad` function
# first filters out all checkpoint variables in the default
# main program, and then saves these variables to the folder
# "./my_paddle_model/__model__".
"""
cur_dir = _get_model_dir(dirname)
save_vars(
executor,
dirname=cur_dir,
main_program=program,
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
def save_pserver_vars_by_notify(executor, dirname, lookup_table,
ps_endpoint_list):
"""
This function sends a checkpoint notify message from Trainer 0
to all the pservers.
The checkpoint notify message contains the lookup table name and
the absolute path on the pserver where lookup_table is saved.
Args:
executor(Executor): The executor to run for sending the checkpoint notify.
dirname(str): The folder where the checkpoints are saved.
lookup_table(string): the lookup table name. When a distributed
lookup table is used, the name can be obtained from DistributeTranspiler.
table_name.
ps_endpoint_list(list): the parameter server ip:port list.
When a distributed lookup table is used, ps_endpoint_list can be
obtained from the distribute arguments.
Return:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
fluid.io.save_pserver_vars_by_notify(executor=exe,
dirname=param_path, lookup_table=table_name,
ps_endpoint_list=ps_endpoints)
"""
cur_dir = _get_lookuptable_dir(dirname)
checkpoint_notify_program = Program()
checkpoint_notify_block = checkpoint_notify_program.global_block()
attrs = {}
attrs['epmap'] = ps_endpoint_list
attrs['dir'] = cur_dir
attrs['lookup_table'] = lookup_table
checkpoint_notify_block.append_op(
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
executor.run(checkpoint_notify_program)
def save_trainer_args(dirname, trainer_id, trainer_args):
assert isinstance(trainer_args, dict)
cur_dir = _get_trainer_dir(dirname, trainer_id)
for name, value in trainer_args.iteritems():
args_file = os.path.join(cur_dir, name)
with open(args_file, 'w') as f:
f.write(str(value))
_write_success(cur_dir)
def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
"""
    The trainer loads some arguments, such as epoch_id and step_id,
    from its own checkpoint sub-directory.
    Args:
        checkpoint_dir(str): The folder where all checkpoints are.
        serial(int): The serial of the checkpoint you would like to load.
        trainer_id(int): The current trainer id.
        trainer_args(list): The names of the trainer arguments to load.
    Returns:
        list: The loaded argument values, as strings, in the same order
        as trainer_args.
Examples:
.. code-block:: python
param_path = "./checkpoint/"
serial = 7
trainer_id = 2
trainer_args = ["epoch_id", "step_id"]
fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial,
trainer_id=trainer_id, trainer_args=trainer_args)
"""
assert isinstance(trainer_args, list)
cur_dir = _get_serial_dir(checkpoint_dir, serial)
cur_dir = _get_trainer_dir(cur_dir, trainer_id)
ret_values = []
for arg in trainer_args:
cur_file = os.path.join(cur_dir, arg)
with open(cur_file, 'r') as f:
contents = f.read()
ret_values.append(contents.strip())
return ret_values
def _is_checkpoint_var(var):
"""
    Checkpoints do not save or load every variable: variables whose type is
    FEED_MINIBATCH, FETCH_LIST or RAW, or whose name contains "@GRAD",
    ".trainer_" or ".block", are discarded.
    : param var(Variable)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
    # Variables with "@GRAD" in the name are gradients; checkpoints do not save them.
if "@GRAD" in var.name:
return False
    # Variables with ".trainer_" in the name are distribute-train variables; checkpoints do not save them.
if ".trainer_" in var.name:
return False
    # Variables with ".block" in the name are distribute-train variables; checkpoints do not save them.
if ".block" in var.name:
return False
return var.persistable
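# --- Editor's note: how this predicate is used (illustrative sketch) -----
# _is_checkpoint_var is passed as the `predicate` of save_vars/load_vars
# above, so only persistable, non-gradient, non-distribute variables reach
# disk. A quick way to inspect what a program would checkpoint:
def _example_list_checkpoint_vars(program):
    # returns the names save_persist_vars_without_grad would write
    return [v.name for v in program.list_vars() if _is_checkpoint_var(v)]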
def _make_checkpoint_dirs(dirs):
    """
    _make_checkpoint_dirs creates the local directory directly; if the
    directory already exists, it is ignored.
    """
    assert dirs is not None
    if os.path.isfile(dirs):
        raise OSError(errno.ENOTDIR, "dirs path should be a directory.", dirs)
    if not os.path.isdir(dirs):
        try:
            os.makedirs(dirs)
        except OSError as err:
            if err.errno != errno.EEXIST:
                raise err
def _get_dir_serial(dirname):
_, serial = dirname.split(CHECKPOINT_SEPARATOR)
try:
serial_num = int(serial)
except ValueError:
serial_num = -1
return serial_num
def _get_serial_dir(dirname, serial):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
serial_dir = os.path.join(dirname, serial_folder)
    _make_checkpoint_dirs(serial_dir)
return serial_dir
def _get_model_dir(dirname):
model_dir = os.path.join(dirname, MODEL_DIR)
    _make_checkpoint_dirs(model_dir)
return model_dir
def _get_lookuptable_dir(dirname):
lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
    _make_checkpoint_dirs(lookuptable_dir)
return lookuptable_dir
def _get_trainer_dir(dirname, trainer_id):
trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
trainer_dir = os.path.join(dirname, trainer_folder)
    _make_checkpoint_dirs(trainer_dir)
return trainer_dir
def _scroll_delete(dirname, max_num_checkpoints=3):
dirs = os.listdir(dirname)
serial_map = {}
for serial in dirs:
serial_num = _get_dir_serial(serial)
serial_map[serial_num] = serial
if len(serial_map.keys()) <= max_num_checkpoints:
return
serials = serial_map.keys()
serials.sort(reverse=True)
serials = serials[max_num_checkpoints:]
for serial in serials:
cur_dir = _get_serial_dir(dirname, serial)
try:
shutil.rmtree(cur_dir)
except OSError as err:
if err.errno != errno.ENOENT:
raise err
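# --- Editor's note: pruning behaviour of _scroll_delete (sketch) ---------
# Serial folders are assumed to be named "checkpoint_<serial>"; only the
# `max_num_checkpoints` newest serials survive. In plain Python:
def _example_scroll(serials, max_num_checkpoints=3):
    # _example_scroll([0, 1, 2, 3, 4]) -> kept [4, 3, 2], removed [1, 0]
    ordered = sorted(serials, reverse=True)
    return ordered[:max_num_checkpoints], ordered[max_num_checkpoints:]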
def _write_success(dirname):
"""
    write a file named "_SUCCESS", containing the current time, into the
    checkpoint dir to indicate that this checkpoint is complete.
    : param dirname
"""
success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
with open(success_file, 'a') as f:
now = time.ctime()
f.write(now)
def get_latest_checkpoint_serial(checkpoint_dir):
"""
    get the serial of the latest valid checkpoint in the directory; a
    checkpoint is valid only if its _SUCCESS file exists.
    : param checkpoint_dir
"""
if not checkpoint_dir:
return -1
def has_success(checkpoint_dir, cur_dir):
"""
        return the serial if the _SUCCESS file exists in this dir, else -1
"""
serial = _get_dir_serial(cur_dir)
if serial == -1 or not os.path.isdir(
os.path.join(checkpoint_dir, cur_dir)):
return -1
success_path = os.path.join(
_get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
SUCCESS_MARK_FILENAME)
        if os.path.isfile(success_path):
            return serial
        return -1
if not os.path.isdir(checkpoint_dir):
return -1
current_dir = -1
dirs = os.listdir(checkpoint_dir)
for cur_dir in dirs:
success_num = has_success(checkpoint_dir, cur_dir)
if success_num > current_dir:
current_dir = success_num
return current_dir
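# --- Editor's note: a usage sketch under an assumed layout ---------------
# A serial is valid only when its __model__/_SUCCESS file exists, e.g.
#   ./ckpt/checkpoint_7/__model__/_SUCCESS   (valid)
#   ./ckpt/checkpoint_9/__model__/           (no _SUCCESS: ignored)
# With that layout:
#   get_latest_checkpoint_serial("./ckpt")   # -> 7
#   get_latest_checkpoint_serial("")         # -> -1, no directory given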
def get_test_program(filelist, program=None, startup_program=None):
    """
    Transpile current train program to a program to read test dataset
...@@ -1459,7 +874,7 @@ def get_test_program(filelist, program=None, startup_program=None):
    main_block = program.global_block()
    for var in main_block.vars.values():
        if var.type == core.VarDesc.VarType.READER:
-           main_block.rename_var(
+           main_block._rename_var(
                str(var.name), str(_get_test_reader_name(var.name)))

    for op in main_block.ops:
...@@ -1468,7 +883,7 @@ def get_test_program(filelist, program=None, startup_program=None):
        if op.type == "create_multi_pass_reader":
            test_op.set_attr("pass_num", 1)

-   startup_program.sync_with_cpp()
-   program.sync_with_cpp()
+   startup_program._sync_with_cpp()
+   program._sync_with_cpp()
    return program
...@@ -68,11 +68,11 @@ class LayerHelper(object):
    @property
    def param_attr(self):
-       return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
+       return ParamAttr._to_attr(self.kwargs.get('param_attr', None))

    @property
    def bias_attr(self):
-       return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
+       return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))

    def multiple_param_attr(self, length):
        param_attr = self.param_attr
...@@ -262,11 +262,11 @@ class LayerHelper(object):
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=g_param_shape,
-           **g_param_attr.to_kwargs(with_initializer=False))
+           **g_param_attr._to_kwargs(with_initializer=False))
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=v_param_shape,
-           **v_param_attr.to_kwargs(with_initializer=True))
+           **v_param_attr._to_kwargs(with_initializer=True))
        __norm_except_dim(
            x=v_param,
            out=g_param,
...@@ -275,9 +275,9 @@ class LayerHelper(object):
        # Add weight normalization to main_program
        g_param = self.main_program.global_block().create_parameter(
-           dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
+           dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
        v_param = self.main_program.global_block().create_parameter(
-           dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
+           dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
...@@ -296,11 +296,11 @@ class LayerHelper(object):
        if default_initializer is None and attr.initializer is None:
            if is_bias:
-               attr.set_default_bias_initializer()
+               attr._set_default_bias_initializer()
            else:
-               attr.set_default_param_initializer()
+               attr._set_default_param_initializer()
        else:
-           attr.set_default_initializer(default_initializer)
+           attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
...@@ -310,9 +310,9 @@ class LayerHelper(object):
            return param
        self.startup_program.global_block().create_parameter(
-           dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
+           dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
        return self.main_program.global_block().create_parameter(
-           dtype=dtype, shape=shape, **attr.to_kwargs())
+           dtype=dtype, shape=shape, **attr._to_kwargs())

    def get_parameter(self, name):
        param = self.main_program.global_block().var(name)
......
...@@ -33,7 +33,6 @@ from metric_op import *
from learning_rate_scheduler import *

__all__ = []
-__all__ += math_op_patch.__all__
__all__ += nn.__all__
__all__ += io.__all__
__all__ += tensor.__all__
......
...@@ -730,8 +730,10 @@ class While(object):
        parent_block.append_op(
            type='while',
            inputs={
-               'X':
-               [parent_block.var_recursive(x_name) for x_name in x_name_list],
+               'X': [
+                   parent_block._var_recursive(x_name)
+                   for x_name in x_name_list
+               ],
                'Condition': [self.cond_var]
            },
            outputs={'Out': out_vars,
...@@ -1259,7 +1261,7 @@ class ConditionalBlock(object):
        input_set = set([ipt.name for ipt in self.inputs])

        param_list = [
-           parent_block.var_recursive(each_name) for each_name in params
+           parent_block._var_recursive(each_name) for each_name in params
            if each_name not in input_set
        ]
......
...@@ -789,7 +789,8 @@ def prior_box(input,
               clip=False,
               steps=[0.0, 0.0],
               offset=0.5,
-              name=None):
+              name=None,
+              min_max_aspect_ratios_order=False):
    """
    **Prior Box Operator**
...@@ -818,6 +819,11 @@ def prior_box(input,
            Default: [0., 0.]
        offset(float): Prior boxes center offset. Default: 0.5
        name(str): Name of the prior box op. Default: None.
+       min_max_aspect_ratios_order(bool): If set to True, the output prior
+           boxes are in the order of [min, max, aspect_ratios], which is
+           consistent with Caffe. Please note that this order affects the
+           weight order of the following convolution layer, but does not
+           affect the final detection results. Default: False.

    Returns:
        tuple: A tuple with two Variable (boxes, variances)
...@@ -871,7 +877,8 @@ def prior_box(input,
        'clip': clip,
        'step_w': steps[0],
        'step_h': steps[1],
-       'offset': offset
+       'offset': offset,
+       'min_max_aspect_ratios_order': min_max_aspect_ratios_order
    }
    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
        if not _is_list_or_tuple_(max_sizes):
...@@ -911,7 +918,8 @@ def multi_box_head(inputs,
                   kernel_size=1,
                   pad=0,
                   stride=1,
-                  name=None):
+                  name=None,
+                  min_max_aspect_ratios_order=False):
    """
    Generate prior boxes for SSD(Single Shot MultiBox Detector)
    algorithm. The details of this algorithm, please refer the
...@@ -954,6 +962,11 @@ def multi_box_head(inputs,
        pad(int|list|tuple): The padding of conv2d. Default:0.
        stride(int|list|tuple): The stride of conv2d. Default:1,
        name(str): Name of the prior box layer. Default: None.
+       min_max_aspect_ratios_order(bool): If set to True, the output prior
+           boxes are in the order of [min, max, aspect_ratios], which is
+           consistent with Caffe. Please note that this order affects the
+           weight order of the following convolution layer, but does not
+           affect the final detection results. Default: False.

    Returns:
        tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
...@@ -1068,7 +1081,8 @@ def multi_box_head(inputs,
        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                            variance, flip, clip, step, offset)
+                            variance, flip, clip, step, offset, None,
+                            min_max_aspect_ratios_order)
        box_results.append(box)
        var_results.append(var)
......
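# --- Editor's note: what min_max_aspect_ratios_order changes (assumed) ---
# With one min size, one max size and aspect_ratios [2., 3.] (flip adding
# the reciprocals), the per-location prior order is assumed to be:
#   False (default): [min, ar=2, ar=1/2, ar=3, ar=1/3, max]
#   True  (Caffe):   [min, max, ar=2, ar=1/2, ar=3, ar=1/3]
# Only the layout (and hence the following conv layer's weight order)
# differs; the detected boxes themselves are the same.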
...@@ -18,10 +18,12 @@ All util layers.
from layer_function_generator import autodoc
from ..framework import unique_name
from ..layer_helper import LayerHelper
+from ..annotations import deprecated

-__all__ = ['get_places']
+__all__ = []


+@deprecated(since='0.15.0', instead="ParallelExecutor")
@autodoc()
def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())
......
...@@ -22,9 +22,9 @@ from ..executor import global_scope
from layer_function_generator import generate_layer_fn, templatedoc

__all__ = [
-   'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
-   'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
-   'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
+   'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+   'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor',
+   'load'
]
...@@ -445,6 +445,88 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
    return monkey_patch_reader_methods(main_prog_var)
def py_reader(capacity, shapes, dtypes, lod_levels=None):
"""
    Create a reader and a blocking queue for data feeding in Python.

    This layer returns a Reader variable and a BlockingQueue. The
    BlockingQueue provides a `push()` method to push a `LoDTensorArray`
    object into the queue from the Python side. On the C++ side, the Reader
    variable invokes the queue's `pop()` method to retrieve the feeding
    data. Feeding data from Python and fetching data in C++ can thus run in
    parallel. The BlockingQueue should be closed with its `close()` method
    when it is no longer needed.
Args:
capacity(int): The maximum capacity of the BlockingQueue.
        shapes(list): List of tuples declaring the data shapes.
        dtypes(list): List of strs declaring the data types.
        lod_levels(list): List of ints declaring the data lod_levels.
Returns:
tuple(Variable, BlockingQueue):
A Reader Variable from which we can get feeding data.
A BlockingQueue object for data feeding.
Examples:
.. code-block:: python
reader, queue = fluid.layers.py_reader(
capacity=10,
shapes=[[-1,3,224,224], [-1,1]],
dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.read_file(reader)
# Via the blocking queue, we can feed data using threads
def feed_data(queue, feed_images, feed_labels):
for feed_image, feed_label in zip(feed_images, feed_labels):
data = core.LoDTensorArray()
data.append(feed_image)
data.append(feed_label)
queue.push(data)
thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels))
thread.start()
"""
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
for shape in shapes:
shape_concat.extend(shape)
ranks.append(len(shape))
if lod_levels is None:
lod_levels = [0] * len(shapes)
queue_name = unique_name('lod_tensor_blocking_queue')
var = global_scope().var(queue_name)
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=unique_name('create_py_reader'))
startup_blk.append_op(
type='create_py_reader',
inputs={'blocking_queue': queue_name},
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'ranks': ranks
})
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var)
return monkey_patch_reader_methods(main_prog_var), feed_queue
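# --- Editor's note: feeding-thread lifecycle (hedged sketch) -------------
# `images` and `labels` below are placeholder sequences of LoDTensors, not
# a real API; the point is the push()/close() protocol on the queue
# described in the docstring above.
def _example_feed(queue, images, labels):
    for img, lbl in zip(images, labels):
        data = core.LoDTensorArray()
        data.append(img)
        data.append(lbl)
        queue.push(data)
    queue.close()  # tell the C++ reader that feeding is finished
# threading.Thread(target=_example_feed, args=(queue, images, labels)).start()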
def open_files(filenames,
               shapes,
               lod_levels,
...@@ -719,7 +801,7 @@ class Preprocessor(object):
        self.sink_var_names = None
        self.status = Preprocessor.BEFORE_SUB_BLOCK

-   def is_completed(self):
+   def _is_completed(self):
        return self.sub_block and self.source_var_names and self.sink_var_names

    @contextlib.contextmanager
...@@ -729,7 +811,7 @@ class Preprocessor(object):
        yield
        self.main_prog.rollback()
        self.status = Preprocessor.AFTER_SUB_BLOCK
-       if not self.is_completed():
+       if not self._is_completed():
            raise RuntimeError(
                "The definition of preprocessor is incompleted! "
                "Please make sure that you have set input and output "
......
...@@ -16,8 +16,6 @@ from ..framework import Variable, unique_name
from layer_function_generator import OpProtoHolder
from ..initializer import force_init_on_cpu

-__all__ = ['monkey_patch_variable']


def monkey_patch_variable():
    def unique_tmp_name():
......
...@@ -76,7 +76,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
    return acc_out


-def auc(input, label, curve='ROC', num_thresholds=200):
+def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
    """
    **Area Under the Curve (AUC) Layer**
...@@ -102,6 +102,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
        curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
        num_thresholds(int): The number of thresholds to use when discretizing
            the roc curve. Default 200.
+       topk(int): Only the top-k predictions are used to compute the AUC.

    Returns:
        Variable: A scalar representing the current AUC.
...@@ -115,7 +116,7 @@ def auc(input, label, curve='ROC', num_thresholds=200):
    """
    warnings.warn(
-       "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
+       "This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \
        but can not aggregate them and get the pass AUC, because pass \
        auc can not be averaged with weighted from the minibatch auc value. \
        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
...@@ -125,14 +126,34 @@ def auc(input, label, curve='ROC', num_thresholds=200):
    topk_indices = helper.create_tmp_variable(dtype="int64")
    topk_out, topk_indices = nn.topk(input, k=k)
    auc_out = helper.create_tmp_variable(dtype="float32")
+   # make tp, tn, fp, fn persistable, so that they can accumulate all batches.
+   tp = helper.create_global_variable(persistable=True)
+   tn = helper.create_global_variable(persistable=True)
+   fp = helper.create_global_variable(persistable=True)
+   fn = helper.create_global_variable(persistable=True)
+   for var in [tp, tn, fp, fn]:
+       helper.set_variable_initializer(
+           var, Constant(
+               value=0.0, force_cpu=True))
+
    helper.append_op(
        type="auc",
        inputs={
            "Out": [topk_out],
            "Indices": [topk_indices],
-           "Label": [label]
+           "Label": [label],
+           "TP": [tp],
+           "TN": [tn],
+           "FP": [fp],
+           "FN": [fn]
        },
        attrs={"curve": curve,
               "num_thresholds": num_thresholds},
-       outputs={"AUC": [auc_out], })
+       outputs={
+           "AUC": [auc_out],
+           "TPOut": [tp],
+           "TNOut": [tn],
+           "FPOut": [fp],
+           "FNOut": [fn]
+       })

    return auc_out
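# --- Editor's note: why TP/TN/FP/FN are persistable (sketch) -------------
# A persistable global variable keeps its value across executor.run()
# calls, so the auc op can accumulate confusion-matrix counts over all
# minibatches rather than starting from zero each batch. Conceptually:
def _example_accumulate(batches):
    # _example_accumulate([(3, 5, 1, 1), (4, 4, 2, 0)]) -> (7, 9, 3, 1)
    totals = [0, 0, 0, 0]                # tp, tn, fp, fn live across calls
    for counts in batches:
        totals = [t + c for t, c in zip(totals, counts)]
    return tuple(totals)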
...@@ -11,6 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
All layers just related to the neural network. All layers just related to the neural network.
""" """
...@@ -71,6 +85,7 @@ __all__ = [
    'transpose',
    'im2sequence',
    'nce',
+   'hsigmoid',
    'beam_search',
    'row_conv',
    'multiplex',
...@@ -3857,6 +3872,74 @@ def nce(input,
    return cost / (num_neg_samples + 1)
def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
"""
    The hierarchical sigmoid operator is used to accelerate the training
    process of language models. This operator organizes the classes into a
    complete binary tree: each leaf node represents a class (a word) and each
    internal node acts as a binary classifier. For each word there is a unique
    path from the root to its leaf node; hsigmoid calculates the cost for each
    internal node on the path and sums them to get the total cost. hsigmoid
    can achieve an acceleration from :math:`O(N)` to :math:`O(logN)`, where
    :math:`N` represents the size of the word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
Args:
input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
and :math:`D` is the feature size.
        label (Variable): The tensor variable containing the labels of the
            training data. It's a tensor with shape :math:`[N \\times 1]`.
        num_classes (int): The number of classes; must not be less than 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for learnable parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for the bias of this layer. If it is set to False, no
bias will be applied.
    Returns:
        Out: (Tensor) The cost of the hierarchical sigmoid operator. The shape is [N, 1].
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[2], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
"""
helper = LayerHelper('hierarchical_sigmoid', **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
pre_out = helper.create_tmp_variable(dtype)
dim = input.shape[1]
if num_classes < 2:
raise ValueError("num_classes must not be less than 2.")
weights = helper.create_parameter(
attr=helper.param_attr,
shape=[num_classes - 1, dim],
is_bias=False,
dtype=input.dtype)
inputs = {"X": input, "W": weights, "Label": label}
if helper.bias_attr:
bias = helper.create_parameter(
attr=helper.bias_attr,
shape=[1, num_classes - 1],
is_bias=True,
dtype=input.dtype)
inputs['Bias'] = bias
helper.append_op(
type="hierarchical_sigmoid",
inputs=inputs,
outputs={"Out": out,
"PreOut": pre_out},
attrs={"num_classes": num_classes})
return out
def transpose(x, perm, name=None):
    """
    Permute the dimensions of `input` according to `perm`.
...@@ -3900,7 +3983,13 @@ def transpose(x, perm, name=None):
    return out


-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+               filter_size=1,
+               stride=1,
+               padding=0,
+               input_image_size=None,
+               out_stride=1,
+               name=None):
    """
    Extracts image patches from the input tensor to form a tensor of shape
    {input.batch_size * output_height * output_width, filter_size_H *
...@@ -3937,6 +4026,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
            padding_up = padding_down = padding_left = padding_right = padding
            Default: padding = 0.

+       input_image_size(Variable): The real sizes of the input images. Its
+           shape is [batchsize, 2]. It is optional and is only used for
+           batch inference.
+       out_stride(int|tuple): The cumulative down-sampling factor of the
+           image through the CNN. It is optional and valid only when
+           input_image_size is not None. If out_stride is a tuple, it must
+           contain two integers, (out_stride_H, out_stride_W); otherwise,
+           out_stride_H = out_stride_W = out_stride.

        name (str): The name of this layer. It is optional.

    Returns:
...@@ -3987,7 +4085,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
            [ 5.  7.  2.  4.  1.  3.  9.  0.]
            [ 7.  9.  4.  8.  3.  5.  0.  8.]]

-           output.dims = {8, 9}
+           output.dims = {8, 8}

            output.lod = [[4, 4]]
...@@ -4009,18 +4107,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
    if len(padding) == 2:
        padding.append(padding[0])
        padding.append(padding[1])
+   inputs = {"X": input}
+   attrs = {"kernels": filter_size, "strides": stride, "padding": padding}
+   if input_image_size:
+       if isinstance(out_stride, int):
+           out_stride = [out_stride, out_stride]
+       inputs["Y"] = input_image_size
+       attrs["out_stride"] = out_stride
    helper = LayerHelper('im2sequence', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
-       type='im2sequence',
-       inputs={'X': input},
-       outputs={'Out': out},
-       attrs={
-           'kernels': filter_size,
-           'strides': stride,
-           'paddings': padding,
-       })
+       type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
    return out
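# --- Editor's note: assumed semantics of input_image_size/out_stride -----
# At batch inference each sample's real size is divided by the network's
# cumulative stride to recover its true feature-map extent (a sketch, not
# the op's exact arithmetic):
def _example_real_out_size(real_h, real_w, out_stride=(8, 8)):
    # _example_real_out_size(224, 320) -> (28, 40)
    return real_h // out_stride[0], real_w // out_stride[1]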
...@@ -4270,7 +4367,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
    helper.set_variable_initializer(
        counter, initializer=Constant(
            value=begin - 1, force_cpu=True))
-   helper.main_program.global_block().prepend_op(
+   helper.main_program.global_block()._prepend_op(
        type='increment',
        inputs={'X': [counter]},
        outputs={'Out': [counter]},
......
...@@ -29,7 +29,7 @@ __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-   'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+   'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
]
...@@ -67,7 +67,7 @@ class Optimizer(object):
        self._LARS_weight_decay = LARS_weight_decay

    def _create_global_learning_rate(self):
-       lr = self.global_learning_rate()
+       lr = self._global_learning_rate()

        if isinstance(lr, framework.Variable):
            return
...@@ -86,7 +86,7 @@ class Optimizer(object):
            dtype='float32' if self._dtype == None else self._dtype,
            persistable=True)

-   def global_learning_rate(self, program=None):
+   def _global_learning_rate(self, program=None):
        """
        get global decayed learning rate
        :return:
...@@ -110,9 +110,9 @@ class Optimizer(object):
            return param_lr
        else:
            if param_lr == 1.0:
-               return self.global_learning_rate()
+               return self._global_learning_rate()
            else:
-               return self.global_learning_rate() * param_lr
+               return self._global_learning_rate() * param_lr

    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters
...@@ -123,7 +123,7 @@ class Optimizer(object):
        """
        pass

-   def _finish_update(self, block):
+   def _finish_update(self, block, parameters_and_grads):
        """Finish any custom updates needed
           before completing an optimization step
...@@ -132,7 +132,7 @@ class Optimizer(object):
            parameters: list of parameter variables for the optimizer

        Returns:
-           list of finish ops or None
+           None
        """
        pass
...@@ -185,7 +185,7 @@ class Optimizer(object):
                format(name, param.name))
        return self._accumulators[name][param.name]

-   def create_optimization_pass(self,
+   def _create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
                                  startup_program=None):
...@@ -221,25 +221,26 @@ class Optimizer(object):
        self._create_global_learning_rate()
        if self._LARS_weight_decay > 0.0:
            layers.append_LARS(parameters_and_grads,
-                              self.global_learning_rate(),
+                              self._global_learning_rate(),
                               self._LARS_weight_decay)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
+           if param_and_grad[1] is None:
+               continue
            with param_and_grad[0].block.program.optimized_guard(
-                   param_and_grad[0]):
-               if param_and_grad[0].trainable is True and param_and_grad[
-                       1] is not None:
+                   param_and_grad):
+               if param_and_grad[0].trainable is True:
                    optimize_op = self._append_optimize_op(loss.block,
                                                           param_and_grad)
                    optimize_ops.append(optimize_op)

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
-       self._finish_update(loss.block)
+       self._finish_update(loss.block, parameters_and_grads)

        end = len(global_block.ops)
-       return global_block.slice_ops(start, end)
+       return global_block._slice_ops(start, end)

    def minimize(self,
                 loss,
...@@ -262,7 +263,7 @@ class Optimizer(object):
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)

-       optimize_ops = self.create_optimization_pass(params_grads, loss,
-                                                    startup_program)
+       optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                     startup_program)
        return optimize_ops, params_grads
...@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
+   _beta1_pow_acc_str = "beta1_pow_acc"
+   _beta2_pow_acc_str = "beta2_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
...@@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer):
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)
-       main_block = block.program.global_block()
-       # Create beta1 and beta2 power tensors
-       beta_shape = [1]
-       self._beta1_pow_acc = self.helper.create_global_variable(
-           name=unique_name.generate('beta1_pow_acc'),
-           dtype='float32' if self._dtype == None else self._dtype,
-           shape=beta_shape,
-           lod_level=0,
-           persistable=True)
-       self.helper.set_variable_initializer(
-           self._beta1_pow_acc, initializer=Constant(self._beta1))
-       self._beta2_pow_acc = self.helper.create_global_variable(
-           name=unique_name.generate('beta2_pow_acc'),
-           dtype='float32' if self._dtype == None else self._dtype,
-           shape=beta_shape,
-           lod_level=0,
-           persistable=True)
-       self.helper.set_variable_initializer(
-           self._beta2_pow_acc, initializer=Constant(self._beta2))

        # Create accumulator tensors for first and second moments
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
+           self._add_accumulator(
+               name=self._beta1_pow_acc_str,
+               param=p,
+               dtype='float32',
+               fill_value=self._beta1,
+               shape=[1])
+           self._add_accumulator(
+               name=self._beta2_pow_acc_str,
+               param=p,
+               dtype='float32',
+               fill_value=self._beta2,
+               shape=[1])

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
...@@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer):
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
+       beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                             param_and_grad[0])
+       beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                             param_and_grad[0])

        # create the adam optimize op
        adam_op = block.append_op(
            type=self.type,
...@@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer):
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment1": moment1,
                "Moment2": moment2,
-               "Beta1Pow": self._beta1_pow_acc,
-               "Beta2Pow": self._beta2_pow_acc
+               "Beta1Pow": beta1_pow_acc,
+               "Beta2Pow": beta2_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
...@@ -566,25 +564,31 @@ class AdamOptimizer(Optimizer):
        return adam_op

-   def _finish_update(self, block):
+   def _finish_update(self, block, param_and_grads):
        """Update Beta1 and Beta2 Power accumulators
        """
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
-       scale_beta1 = main_block.append_op(
-           type="scale",
-           inputs={"X": self._beta1_pow_acc},
-           outputs={"Out": self._beta1_pow_acc},
-           attrs={"scale": self._beta1})
-       scale_beta2 = main_block.append_op(
-           type="scale",
-           inputs={"X": self._beta2_pow_acc},
-           outputs={"Out": self._beta2_pow_acc},
-           attrs={"scale": self._beta2})
-       return [scale_beta1, scale_beta2]
+       for param, grad in param_and_grads:
+           if grad is None:
+               continue
+           with param.block.program.optimized_guard([param, grad]):
+               beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                     param)
+               beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                     param)
+               main_block.append_op(
+                   type="scale",
+                   inputs={"X": beta1_pow_acc},
+                   outputs={"Out": beta1_pow_acc},
+                   attrs={"scale": self._beta1})
+               main_block.append_op(
+                   type="scale",
+                   inputs={"X": beta2_pow_acc},
+                   outputs={"Out": beta2_pow_acc},
+                   attrs={"scale": self._beta2})
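# --- Editor's note: what the scale ops maintain (plain-Python sketch) ----
# After t steps the accumulators hold beta1**t and beta2**t; the adam op
# uses them for the standard bias correction
# alpha_t = alpha * sqrt(1 - beta2**t) / (1 - beta1**t).
beta1, beta2 = 0.9, 0.999
beta1_pow, beta2_pow = beta1, beta2          # initial fill_value per param
for _ in range(3):                           # one iteration per minimize()
    correction = (1 - beta2_pow) ** 0.5 / (1 - beta1_pow)
    beta1_pow *= beta1                       # the "scale" op on Beta1Pow
    beta2_pow *= beta2                       # the "scale" op on Beta2Pow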
class AdamaxOptimizer(Optimizer):
    """
...@@ -626,6 +630,7 @@ class AdamaxOptimizer(Optimizer):
    """
    _moment_acc_str = "moment"
    _inf_norm_acc_str = "inf_norm"
+   _beta1_pow_acc_str = "beta1_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
...@@ -645,21 +650,16 @@ class AdamaxOptimizer(Optimizer):
        self._epsilon = epsilon

    def _create_accumulators(self, block, parameters):
-       # Create beta1 power accumulator tensor
-       beta_shape = [1]
-       self._beta1_pow_acc = self.helper.create_global_variable(
-           name=unique_name.generate('beta1_pow_acc'),
-           dtype='float32' if self._dtype == None else self._dtype,
-           shape=beta_shape,
-           lod_level=0,
-           persistable=True)
-       self.helper.set_variable_initializer(
-           self._beta1_pow_acc, initializer=Constant(self._beta1))

        # Create accumulator tensors for first moment and infinity norm
        for p in parameters:
            self._add_accumulator(self._moment_acc_str, p)
            self._add_accumulator(self._inf_norm_acc_str, p)
+           self._add_accumulator(
+               name=self._beta1_pow_acc_str,
+               param=p,
+               dtype='float32',
+               fill_value=self._beta1,
+               shape=[1])

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
...@@ -667,6 +667,8 @@ class AdamaxOptimizer(Optimizer):
        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                         param_and_grad[0])
+       beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                             param_and_grad[0])

        # create the adamax optimize op
        adamax_op = block.append_op(
            type=self.type,
...@@ -676,7 +678,7 @@ class AdamaxOptimizer(Optimizer):
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment": moment,
                "InfNorm": inf_norm,
-               "Beta1Pow": self._beta1_pow_acc
+               "Beta1Pow": beta1_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
...@@ -691,19 +693,23 @@ class AdamaxOptimizer(Optimizer):
        return adamax_op

-   def _finish_update(self, block):
+   def _finish_update(self, block, parameters_and_grads):
        """Update Beta1 Power accumulator
        """
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
-       scale_beta1 = main_block.append_op(
-           type="scale",
-           inputs={"X": self._beta1_pow_acc},
-           outputs={"Out": self._beta1_pow_acc},
-           attrs={"scale": self._beta1})
-       return [scale_beta1]
+       for param, grad in parameters_and_grads:
+           if grad is None:
+               continue
+           with param.block.program.optimized_guard([param, grad]):
+               beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                     param)
+               main_block.append_op(
+                   type="scale",
+                   inputs={"X": beta1_pow_acc},
+                   outputs={"Out": beta1_pow_acc},
+                   attrs={"scale": self._beta1})


class DecayedAdagradOptimizer(Optimizer):
    """
...@@ -1156,6 +1162,9 @@ class ModelAverage(Optimizer):
                self.params_grads.append((param, grad))

        for param, grad in self.params_grads:
+           if grad is None:
+               continue
+           with param.block.program.optimized_guard([param, grad]):
                self._append_average_accumulate_op(param)

        self.apply_program = Program()
...@@ -1171,16 +1180,16 @@ class ModelAverage(Optimizer):
                self._add_average_restore_op(block, param_grad)

    def _add_average_apply_op(self, block, param_grad):
-       param = block.clone_variable(param_grad[0])
-       grad = block.clone_variable(param_grad[1])
-       sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
-       sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
-       sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
-       num_accumulates = block.clone_variable(
+       param = block._clone_variable(param_grad[0])
+       grad = block._clone_variable(param_grad[1])
+       sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
+       sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
+       sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
+       num_accumulates = block._clone_variable(
            self._get_accumulator('num_accumulates', param))
-       old_num_accumulates = block.clone_variable(
+       old_num_accumulates = block._clone_variable(
            self._get_accumulator('old_num_accumulates', param))
-       num_updates = block.clone_variable(
+       num_updates = block._clone_variable(
            self._get_accumulator('num_updates', param))
        # backup param value to grad
        layers.assign(input=param, output=grad)
...@@ -1194,8 +1203,8 @@ class ModelAverage(Optimizer):
        layers.elementwise_div(x=sum, y=tmp, out=param)

    def _add_average_restore_op(self, block, param_grad):
-       param = block.clone_variable(param_grad[0])
-       grad = block.clone_variable(param_grad[1])
+       param = block._clone_variable(param_grad[0])
+       grad = block._clone_variable(param_grad[1])
        layers.assign(input=grad, output=param)

    def _append_average_accumulate_op(self, param):
......
...@@ -152,7 +152,7 @@ class ParallelExecutor(object):
        self.executor = core.ParallelExecutor(
            self._places,
            set([
-               p.name for p in main.global_block().iter_parameters()
+               p.name for p in main.global_block()._iter_parameters()
                if not p.stop_gradient
            ]),
            set(self.persistable_vars), main.desc, loss_name
......
...@@ -67,7 +67,7 @@ class ParamAttr(object):
        self.gradient_clip = gradient_clip
        self.model_average = do_model_average

-   def set_default_initializer(self, initializer):
+   def _set_default_initializer(self, initializer):
        """
        Set the default initializer, the initializer should be Constant,
        Uniform, Normal, Xavier, MSRA.
...@@ -88,7 +88,7 @@ class ParamAttr(object):
        self.initializer = initializer

-   def set_default_param_initializer(self):
+   def _set_default_param_initializer(self):
        """
        Set the default initializer for the parameter with Xavier.
...@@ -98,9 +98,9 @@ class ParamAttr(object):
        Returns:
            None.
        """
-       self.set_default_initializer(Xavier())
+       self._set_default_initializer(Xavier())

-   def set_default_bias_initializer(self):
+   def _set_default_bias_initializer(self):
        """
        Set the default initializer for the bias with Constant(0.0).
...@@ -110,10 +110,10 @@ class ParamAttr(object):
        Returns:
            None.
        """
-       self.set_default_initializer(Constant(0.0))
+       self._set_default_initializer(Constant(0.0))

    @staticmethod
-   def to_attr(arg):
+   def _to_attr(arg):
        """
        Create ParamAttr[s].
...@@ -131,7 +131,7 @@ class ParamAttr(object):
        if arg is None:
            return ParamAttr()
        elif isinstance(arg, list) or isinstance(arg, tuple):
-           return [ParamAttr.to_attr(a) for a in arg]
+           return [ParamAttr._to_attr(a) for a in arg]
        elif isinstance(arg, ParamAttr):
            return arg
        elif isinstance(arg, str) or isinstance(arg, unicode):
...@@ -141,11 +141,11 @@ class ParamAttr(object):
        elif isinstance(arg, WeightDecayRegularizer):
            return ParamAttr(regularizer=arg)
        elif isinstance(arg, bool):
-           return ParamAttr.to_attr(None) if arg else False
+           return ParamAttr._to_attr(None) if arg else False
        else:
            raise TypeError("{0} cast to ParamAttr".format(type(arg)))

-   def to_kwargs(self, with_initializer=False):
+   def _to_kwargs(self, with_initializer=False):
        """
        Returns the attributes of this parameter.
......
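# --- Editor's note: the conversions _to_attr accepts (hedged sketch) -----
# Behaviour inferred from the branches visible above; the str branch is
# assumed to build a named ParamAttr.
# ParamAttr._to_attr(None)           -> default ParamAttr()
# ParamAttr._to_attr([a1, a2])       -> [_to_attr(a1), _to_attr(a2)]
# ParamAttr._to_attr('fc_w')         -> ParamAttr(name='fc_w')   (assumed)
# ParamAttr._to_attr(L2Decay(1e-4))  -> ParamAttr(regularizer=L2Decay(1e-4))
# ParamAttr._to_attr(False)          -> False (e.g. bias disabled)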
...@@ -15,10 +15,7 @@
import framework
from . import core

-__all__ = [
-   'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer',
-   'L2DecayRegularizer'
-]
+__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']


def append_regularization_ops(parameters_and_grads, regularization=None):
...@@ -44,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
    """
    params_and_grads = []
    for param, grad in parameters_and_grads:
-       with param.block.program.optimized_guard(param):
-           # If no gradient then we don't need to do anything
-           if grad is None:
-               params_and_grads.append((param, grad))
-               continue
+       # If no gradient then we don't need to do anything
+       if grad is None:
+           params_and_grads.append((param, grad))
+           continue
+       with param.block.program.optimized_guard([param, grad]):
            regularization_term = None
            if param.regularizer is not None:
                # Add variable for regularization term in grad block
......
...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
+from paddle.fluid.layers.device import get_places

import unittest
import paddle.fluid as fluid
import paddle
...@@ -144,7 +144,7 @@ def train(word_dict,
        cost, acc_out, prediction = net_method(
            data, label, input_dim=dict_dim, class_dim=class_dim)
    else:
-       places = fluid.layers.get_places()
+       places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost, acc, _ = net_method(
......
...@@ -12,15 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
-import argparse
-import paddle.fluid as fluid
-import paddle
-import sys
-import numpy
-import unittest
import math
-import sys
import os
+import sys
+import unittest
+import numpy
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places

BATCH_SIZE = 64
...@@ -76,7 +78,7 @@ def train(nn_type,
    net_conf = conv_net

    if parallel:
-       places = fluid.layers.get_places()
+       places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            img_ = pd.read_input(img)
......
...@@ -14,6 +14,7 @@
import paddle
import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
import unittest
import os
import numpy as np
...@@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
-       places = fluid.layers.get_places()
+       places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            avg_cost, predict_word = __network__(
......
...@@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import numpy as np
-import paddle
-import paddle.fluid as fluid
import math
import sys
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places

# need to fix random seed and training data to compare the loss
# value accurately calculated by the default and the memory optimization
# version.
...@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda():
    use_nccl = False
    place = fluid.CUDAPlace(0)

-places = fluid.layers.get_places(device_count=0, device_type=device_type)
+places = get_places(device_count=0, device_type=device_type)
pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
with pd.do():
    x_ = pd.read_input(x)
......
@@ -60,8 +60,8 @@ def get_numeric_gradient(place,
         return np.array(sum).mean()

     tensor_to_check = scope.find_var(input_to_check).get_tensor()
-    tensor_size = product(tensor_to_check.get_dims())
+    tensor_size = product(tensor_to_check.shape())
-    tensor_to_check_dtype = tensor_to_check.dtype()
+    tensor_to_check_dtype = tensor_to_check._dtype()
     if tensor_to_check_dtype == core.VarDesc.VarType.FP32:
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
@@ -74,15 +74,15 @@ def get_numeric_gradient(place,

     def __get_elem__(tensor, i):
         if tensor_to_check_dtype == np.float32:
-            return tensor.get_float_element(i)
+            return tensor._get_float_element(i)
         else:
-            return tensor.get_double_element(i)
+            return tensor._get_double_element(i)

     def __set_elem__(tensor, i, e):
         if tensor_to_check_dtype == np.float32:
-            tensor.set_float_element(i, e)
+            tensor._set_float_element(i, e)
         else:
-            tensor.set_double_element(i, e)
+            tensor._set_double_element(i, e)

     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
@@ -107,7 +107,7 @@ def get_numeric_gradient(place,
         __set_elem__(tensor_to_check, i, origin)
         gradient_flat[i] = (y_pos - y_neg) / delta / 2

-    return gradient_flat.reshape(tensor_to_check.get_dims())
+    return gradient_flat.reshape(tensor_to_check.shape())

 class OpTest(unittest.TestCase):
@@ -125,7 +125,7 @@ class OpTest(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
-        '''Restore random seeds'''
+        """Restore random seeds"""
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
......
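The renames above are cosmetic, but the helper they live in is the heart of OpTest's gradient check: perturb one element at a time and take a central difference. A self-contained NumPy sketch of the same scheme (function and names are ours, not the harness's):

# sketch: central-difference numeric gradient of a scalar function
import numpy as np

def numeric_gradient(f, x, delta=1e-3):
    x = x.astype(np.float64)
    flat = x.ravel()                    # view: writing flat[i] updates x
    grad = np.zeros_like(flat)
    for i in range(flat.size):
        origin = flat[i]
        flat[i] = origin + delta
        y_pos = f(x)
        flat[i] = origin - delta
        y_neg = f(x)
        flat[i] = origin                # restore before the next element
        grad[i] = (y_pos - y_neg) / delta / 2   # same formula as the hunk above
    return grad.reshape(x.shape)

# quick check against an analytic gradient: d/dx sum(x**2) = 2x
x = np.random.rand(3, 2)
assert np.allclose(numeric_gradient(lambda t: (t ** 2).sum(), x), 2 * x, atol=1e-4)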
@@ -35,7 +35,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   feed_dict=None,
                                   seed=None,
                                   use_parallel_executor=True,
-                                  balance_parameter_opt_between_cards=False):
+                                  use_reduce=False,
+                                  optimizer=fluid.optimizer.Adam):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -50,14 +51,19 @@ class TestParallelExecutorBase(unittest.TestCase):
         main = fluid.Program()
         startup = fluid.Program()
         startup.random_seed = 1  # Fix random seed
+        main.random_seed = 1
         with fluid.program_guard(main, startup):
             if seed is not None:
                 startup.random_seed = seed
+                main.random_seed = seed
             loss = method(use_feed=feed_dict is not None)
-            adam = fluid.optimizer.Adam()
-            adam.minimize(loss)
+            optimizer().minimize(loss)
             if memory_opt:
                 fluid.memory_optimize(main)
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         startup_exe = fluid.Executor(place)
         startup_exe.run(startup)
@@ -65,7 +71,8 @@ class TestParallelExecutorBase(unittest.TestCase):
         exec_strategy.allow_op_delay = allow_op_delay
         build_strategy = fluid.BuildStrategy()
-        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
+            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
         if use_parallel_executor:
             exe = fluid.ParallelExecutor(
......
@@ -24,7 +24,20 @@ class TestAucOp(OpTest):
         indices = np.random.randint(0, 2, (128, 2))
         labels = np.random.randint(0, 2, (128, 1))
         num_thresholds = 200
-        self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels}
+        tp = np.zeros((num_thresholds, )).astype("int64")
+        tn = np.zeros((num_thresholds, )).astype("int64")
+        fp = np.zeros((num_thresholds, )).astype("int64")
+        fn = np.zeros((num_thresholds, )).astype("int64")
+        self.inputs = {
+            'Out': pred,
+            'Indices': indices,
+            'Label': labels,
+            'TP': tp,
+            'TN': tn,
+            'FP': fp,
+            'FN': fn
+        }
         self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
         # NOTE: sklearn use a different way to generate thresholds
         # which will cause the result differs slightly:
@@ -71,7 +84,13 @@ class TestAucOp(OpTest):
         y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
         auc_value = np.sum(x * y)

-        self.outputs = {'AUC': auc_value}
+        self.outputs = {
+            'AUC': auc_value,
+            'TPOut': tp_list,
+            'FNOut': fn_list,
+            'TNOut': tn_list,
+            'FPOut': fp_list
+        }

     def test_check_output(self):
         self.check_output()
......
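The hunk above makes the AUC operator stateful: TP/TN/FP/FN enter as inputs and leave as *Out outputs, so counts can accumulate across batches. A rough NumPy sketch of the per-batch ROC-AUC arithmetic the test mirrors (simplified; variable names are ours):

# sketch: thresholded confusion counts, then trapezoidal ROC integration
import numpy as np

def roc_auc(scores, labels, num_thresholds=200):
    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                  for i in range(num_thresholds - 2)]
    thresholds = [0.0 - 1e-6] + thresholds + [1.0 + 1e-6]
    tp = np.zeros(num_thresholds); fn = np.zeros(num_thresholds)
    tn = np.zeros(num_thresholds); fp = np.zeros(num_thresholds)
    for idx, thresh in enumerate(thresholds):
        pred = scores >= thresh
        tp[idx] = np.sum(pred & (labels == 1))
        fp[idx] = np.sum(pred & (labels == 0))
        fn[idx] = np.sum(~pred & (labels == 1))
        tn[idx] = np.sum(~pred & (labels == 0))
    tpr = tp / np.maximum(tp + fn, 1e-12)
    fpr = fp / np.maximum(fp + tn, 1e-12)
    # same trapezoid as the x/y lines in the hunk above
    x = fpr[:num_thresholds - 1] - fpr[1:]
    y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
    return np.sum(x * y)

scores = np.random.rand(128)
labels = np.random.randint(0, 2, 128)
print(roc_auc(scores, labels))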
@@ -129,7 +129,6 @@ def create_or_get_tensor(scope, var_name, var, place):
     if var is not None:
         assert isinstance(var, np.ndarray)
         tensor.set_recursive_sequence_lengths([])
-        tensor.set_dims(var.shape)
         tensor.set(var, place)
     return tensor
......
@@ -16,8 +16,6 @@ import unittest
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import calc_gradient
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
import os
import tempfile


class TestCheckpoint(unittest.TestCase):
    def setUp(self):
        self.dirname = tempfile.mktemp()
        self.max_num_checkpoints = 3
        self.epoch_interval = 1
        self.step_interval = 1
        self.trainer_id = 0
        self.chief = self.trainer_id == 0
        self.place = fluid.CPUPlace()
        self.epoch_id = 100
        self.step_id = 20

    def test_checkpoint(self):
        self.save_checkpoint()
        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
        self.assertTrue(serial >= 0)

        trainer_args = ["epoch_id", "step_id"]
        epoch_id, step_id = fluid.io.load_trainer_args(
            self.dirname, serial, self.trainer_id, trainer_args)
        self.assertEqual(self.step_id, int(step_id))
        self.assertEqual(self.epoch_id, int(epoch_id))

        program = fluid.Program()
        with fluid.program_guard(program):
            exe = fluid.Executor(self.place)
            fluid.io.load_checkpoint(exe, self.dirname, serial, program)

        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
        self.assertFalse(os.path.isdir(self.dirname))

    def save_checkpoint(self):
        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
                                        self.epoch_interval, self.step_interval)

        trainer_args = {}
        trainer_args["epoch_id"] = self.epoch_id
        trainer_args["step_id"] = self.step_id

        program = fluid.Program()
        with fluid.program_guard(program):
            program.global_block().create_var(
                name="scale_0",
                persistable=True,
                dtype="float32",
                shape=[32, 32])

            exe = fluid.Executor(self.place)
            for i in xrange(10):
                fluid.io.save_checkpoint(exe, config.checkpoint_dir,
                                         self.trainer_id, trainer_args,
                                         program, config.max_num_checkpoints)


if __name__ == '__main__':
    unittest.main()
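max_num_checkpoints in the test above caps how many serialized checkpoints survive on disk: the loop saves ten times but only the newest three remain. A minimal pure-Python sketch of that rotation policy (the directory layout and names are our assumptions, not fluid.io's internals):

# sketch: keep-at-most-N checkpoint rotation
import os
import shutil
import tempfile

def save_rotated_checkpoint(root, serial, max_num_checkpoints=3):
    """Write checkpoint_<serial> and drop the oldest beyond the cap."""
    os.makedirs(os.path.join(root, "checkpoint_%d" % serial))
    serials = sorted(
        int(name.split("_")[1]) for name in os.listdir(root)
        if name.startswith("checkpoint_"))
    for old in serials[:-max_num_checkpoints]:
        shutil.rmtree(os.path.join(root, "checkpoint_%d" % old))

root = tempfile.mkdtemp()
for serial in range(10):
    save_rotated_checkpoint(root, serial)
print(sorted(os.listdir(root)))   # only the 3 newest serials remain
shutil.rmtree(root)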
@@ -22,6 +22,9 @@ import numpy

 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.io import ListenAndServ
+from paddle.fluid.layers.io import Recv
+from paddle.fluid.layers.io import Send

 class TestSendOp(unittest.TestCase):
@@ -65,8 +68,7 @@ class TestSendOp(unittest.TestCase):
         main = fluid.Program()
         with fluid.program_guard(main):
-            serv = layers.ListenAndServ(
-                "127.0.0.1:0", ["X"], optimizer_mode=False)
+            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
             with serv.do():
                 out_var = main.global_block().create_var(
                     name="scale_0.tmp_0",
@@ -99,8 +101,8 @@ class TestSendOp(unittest.TestCase):
             persistable=False,
             shape=[32, 32])
         fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
-        layers.Send("127.0.0.1:%d" % port, [x])
-        o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+        Send("127.0.0.1:%d" % port, [x])
+        o = Recv("127.0.0.1:%d" % port, [get_var])
         exe = fluid.Executor(place)
         self.dist_out = exe.run(main, fetch_list=o)  # o is a list
......
@@ -27,7 +27,6 @@ class TranspilerTest(unittest.TestCase):
         self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
         self.pserver1_ep = "127.0.0.1:6174"
         self.pserver2_ep = "127.0.0.1:6175"
-        self.slice_var_up = True
         self.sync_mode = True
         self.transpiler = None
@@ -52,27 +51,26 @@ class TranspilerTest(unittest.TestCase):
         self.origin_prog = main.clone()
         return main

-    def get_trainer(self):
-        t = self._transpiler_instance()
+    def get_trainer(self, config=None):
+        t = self._transpiler_instance(config)
         return t.get_trainer_program()

-    def get_pserver(self, ep):
-        t = self._transpiler_instance()
+    def get_pserver(self, ep, config=None):
+        t = self._transpiler_instance(config)
         pserver = t.get_pserver_program(ep)
         startup = t.get_startup_program(ep, pserver)
         return pserver, startup

-    def _transpiler_instance(self):
+    def _transpiler_instance(self, config=None):
         if not self.transpiler:
             main = self.get_main_program()
-            self.transpiler = fluid.DistributeTranspiler()
+            self.transpiler = fluid.DistributeTranspiler(config=config)
             self.transpiler.transpile(
                 self.trainer_id,
                 program=main,
                 pservers=self.pserver_eps,
-                trainers=self.trainers,
-                slice_var_up=self.slice_var_up,
-                sync_mode=self.sync_mode)
+                trainers=self.trainers)
         return self.transpiler
@@ -124,14 +122,67 @@ class TestBasicModel(TranspilerTest):
         self.assertEqual(set(pserver_params), set(trainer_params))

+class TestBasicModelWithLargeBlockSize(TranspilerTest):
+    def test_transpiler(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.min_block_size = 1048576
+
+        pserver, startup = self.get_pserver(self.pserver1_ep, config)
+        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
+
+        trainer = self.get_trainer(config)
+
+        self.assertEqual([op.type for op in trainer.global_block().ops], [
+            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
+            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'send_barrier',
+            'recv', 'recv', 'fetch_barrier'
+        ])
+
+        self.assertEqual(len(pserver.blocks), 2)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block1~2: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+        # confirm startup program
+        self.assertEqual([op.type for op in startup.global_block().ops],
+                         ["fill_constant", "fill_constant", "fill_constant"])
+        # with a 1 MiB minimum block size, the variable fc_w is NOT split
+        fc_w_var = startup2.global_block().var("fc_w")
+        self.assertEqual(fc_w_var.shape, (1000L, 1000L))
+        # all parameters should be optimized on pserver
+        pserver_params = []
+        for prog in [pserver, pserver2]:
+            for blk in prog.blocks:
+                for op in blk.ops:
+                    if "Param" in op.input_names:
+                        param_name = op.input("Param")[0]
+                        is_block_idx = param_name.find(".block")
+                        if is_block_idx != -1:
+                            origin_param_name = param_name[:is_block_idx]
+                        else:
+                            origin_param_name = param_name
+                        pserver_params.append(origin_param_name)
+        trainer_params = []
+        for op in self.origin_prog.global_block().ops:
+            if "Param" in op.input_names:
+                trainer_params.append(op.input("Param")[0])
+        self.assertEqual(set(pserver_params), set(trainer_params))
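TestBasicModelWithLargeBlockSize drives min_block_size up to 1 MiB so the 1000x1000 fc_w no longer gets sliced across parameter servers. A simplified sketch of the slicing arithmetic (the real heuristics live in the transpiler; this is our approximation):

# sketch: roughly how many slices a parameter is cut into before
# round-robin placement on parameter servers
def split_into_blocks(numel, min_block_size, num_pservers):
    if numel < min_block_size * 2:
        return 1                      # too small to be worth slicing
    return min(num_pservers, max(1, numel // min_block_size))

# fc_w is 1000 x 1000 = 1e6 floats
print(split_into_blocks(1000 * 1000, 8192, 2))     # small minimum: sliced in two
print(split_into_blocks(1000 * 1000, 1048576, 2))  # 1 MiB minimum: kept whole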
 class TestNoSliceVar(TranspilerTest):
     def setUp(self):
         super(TestNoSliceVar, self).setUp()
-        self.slice_var_up = False

     def test_transpiler(self):
-        _, startup = self.get_pserver(self.pserver1_ep)
-        _, startup2 = self.get_pserver(self.pserver2_ep)
+        config = fluid.DistributeTranspilerConfig()
+        config.slice_var_up = False
+
+        _, startup = self.get_pserver(self.pserver1_ep, config)
+        _, startup2 = self.get_pserver(self.pserver2_ep, config)

         if startup.global_block().vars.has_key("fc_w"):
             fc_w_var = startup.global_block().vars["fc_w"]
@@ -253,10 +304,50 @@ class TestL2Decay(TranspilerTest):
     # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer

-# FIXME(typhoonzero): need to add test for async case:
-# see https://github.com/PaddlePaddle/Paddle/issues/11691
-class TestAsyncSGD(TranspilerTest):
-    pass
+class TestL2DecayWithPiecewise(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        base_lr = 1.0
+        bd = [1, 10, 20, 30]
+        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+        sgd_optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def test_transpiler(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer = self.get_trainer()
+
+        self.assertEqual(len(pserver.blocks), 9)
+        self.assertEqual([op.type for op in pserver.blocks[1].ops], [
+            "increment", "cast", "fill_constant", "fill_constant", "less_than",
+            "logical_not", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "conditional_block"
+        ])
+        self.assertEqual(
+            [op.type for op in pserver.blocks[7].ops],
+            ["sum", "scale", "scale", "elementwise_add", "momentum"])
+        self.assertEqual(
+            [op.type for op in pserver.blocks[8].ops],
+            ["sum", "scale", "scale", "elementwise_add", "momentum"])

 if __name__ == "__main__":
......
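The new TestL2DecayWithPiecewise builds its schedule from boundaries bd and values lr. What piecewise_decay computes can be restated in a few lines of plain Python (our paraphrase of the schedule, not fluid's implementation):

# sketch: values[i] applies while step < boundaries[i]; last value after
def piecewise_decay(step, boundaries, values):
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

base_lr = 1.0
bd = [1, 10, 20, 30]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
for step in [0, 5, 15, 25, 100]:
    print(step, piecewise_decay(step, bd, lr))
# 0 -> 1.0, 5 -> 0.1, 15 -> 0.01, 25 -> 0.001, 100 -> 0.0001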
@@ -65,10 +65,10 @@ class TestDyRnnStaticInput(unittest.TestCase):
         return self._lodtensor_to_ndarray(fetch_outs[0])

     def _lodtensor_to_ndarray(self, lod_tensor):
-        dims = lod_tensor.get_dims()
+        dims = lod_tensor.shape()
         ndarray = np.zeros(shape=dims).astype('float32')
         for i in xrange(np.product(dims)):
-            ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+            ndarray.ravel()[i] = lod_tensor._get_float_element(i)
         return ndarray, lod_tensor.recursive_sequence_lengths()

     def build_graph(self, only_forward=False):
@@ -185,19 +185,19 @@ class TestDyRnnStaticInput(unittest.TestCase):
         actual_gradients, actual_lod = self.fetch_value(static_input_grad)

-        static_input_shape = self.static_input_tensor.get_dims()
+        static_input_shape = self.static_input_tensor.shape()
         numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
         # calculate numeric gradients
         tensor_size = np.product(static_input_shape)
         for i in xrange(tensor_size):
-            origin = self.static_input_tensor.get_float_element(i)
+            origin = self.static_input_tensor._get_float_element(i)
             x_pos = origin + self._delta
-            self.static_input_tensor.set_float_element(i, x_pos)
+            self.static_input_tensor._set_float_element(i, x_pos)
             y_pos = self.fetch_value(loss)[0][0]
             x_neg = origin - self._delta
-            self.static_input_tensor.set_float_element(i, x_neg)
+            self.static_input_tensor._set_float_element(i, x_neg)
             y_neg = self.fetch_value(loss)[0][0]
-            self.static_input_tensor.set_float_element(i, origin)
+            self.static_input_tensor._set_float_element(i, origin)
             numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
         self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
         self.assertTrue(
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest


class TestFakeQuantizeOp(OpTest):
    def setUp(self):
        self.op_type = "fake_quantize"
        self.attrs = {
            'bit_length': 8,
            'quantize_type': 'abs_max',
            'window_size': 10000
        }
        self.inputs = {
            'X': np.random.random((10, 10)).astype("float32"),
            'InScales': np.zeros(self.attrs['window_size']).astype("float32"),
            'InCurrentIter': np.zeros(1).astype("float32"),
            'InMovingScale': np.zeros(1).astype("float32")
        }
        self.scale = {
            'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
        }
        self.outputs = {
            'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * (
                (1 << (self.attrs['bit_length'] - 1)) - 1)),
            'OutScales': np.zeros(self.attrs['window_size']).astype("float32"),
            'OutMovingScale':
            np.array([self.scale['abs_max']]).astype("float32"),
            'OutCurrentIter': np.zeros(1).astype("float32")
        }

    def test_check_output(self):
        self.check_output()


if __name__ == "__main__":
    unittest.main()
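The abs_max branch tested above scales by the tensor's largest magnitude and rounds into a signed b-bit range. A standalone NumPy sketch of that forward computation (mirroring the expected-output math in the test; the dequantize step is our addition for illustration):

# sketch: abs_max fake quantization forward pass
import numpy as np

def fake_quantize_abs_max(x, bit_length=8):
    scale = np.max(np.abs(x)).astype("float32")
    qmax = (1 << (bit_length - 1)) - 1      # 127 for 8 bits
    quantized = np.round(x / scale * qmax)  # integers in [-qmax, qmax]
    dequantized = quantized * scale / qmax  # what the network actually sees
    return quantized, dequantized, scale

x = np.random.uniform(-1, 1, (10, 10)).astype("float32")
q, dq, s = fake_quantize_abs_max(x)
print(s, np.abs(x - dq).max())  # reconstruction error bounded by s / 254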
@@ -13,6 +13,7 @@
 # limitations under the License.

 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import decorators
 import unittest
@@ -20,7 +21,7 @@ import unittest
 class TestGetPlaces(unittest.TestCase):
     @decorators.prog_scope()
     def test_get_places(self):
-        places = fluid.layers.get_places()
+        places = get_places()
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
         exe.run(fluid.default_main_program())
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
from op_test import OpTest


def find_latest_set(num):
    return 1 + int(math.floor(math.log(num, 2)))


class CodeTable(object):
    def __init__(self, num_classes, code):
        self.c = num_classes + code

    def cal_index(self, bit):
        return (self.c >> (bit + 1)) - 1

    def get_length(self):
        return find_latest_set(self.c) - 1

    def cal_bit(self, bit):
        return self.c & (1 << bit)


def hsigmoid(x, w, label, bias, num_classes):
    batch_size = x.shape[0]
    code_length = find_latest_set(num_classes - 1)
    code_table = [0 for _ in range(code_length)]
    pre_output = np.zeros((batch_size, code_length))
    pre_sum = np.zeros((batch_size, 1))
    out = np.zeros((batch_size, 1)).astype("float32")
    for i in range(batch_size):
        code_table = CodeTable(num_classes, label[i])
        length = code_table.get_length()
        for j in range(length):
            idx = code_table.cal_index(j)
            pre_output[i][j] += bias[0][idx]
    for i in range(batch_size):
        code_table = CodeTable(num_classes, label[i])
        length = code_table.get_length()
        for j in range(length):
            idx = code_table.cal_index(j)
            pre_output[i][j] += np.dot(w[idx], x[i])
    # clip to [-40.0, 40.0]
    pre_output = np.clip(pre_output, -40.0, 40.0)
    # out(i, 0) = \sum_j bit(i, j) * preout(i, j)
    for i in range(batch_size):
        code_table = CodeTable(num_classes, label[i])
        length = code_table.get_length()
        sum = 0.0
        for j in range(length):
            if code_table.cal_bit(j):
                sum += pre_output[i][j]
        out[i] = -1.0 * sum
    # soft relu
    pre_output = np.log(1 + np.exp(pre_output))
    pre_sum = pre_output.sum(1).reshape((batch_size, 1))
    out += pre_sum
    return pre_output, out


class TestHSigmoidOp(OpTest):
    def setUp(self):
        self.op_type = "hierarchical_sigmoid"
        num_classes = 6
        feature_size = 8
        batch_size = 4
        x = np.random.random((batch_size, feature_size)).astype("float32")
        w = np.random.random((num_classes - 1, feature_size)).astype("float32")
        label = np.random.randint(0, num_classes, (batch_size, 1))
        bias = np.random.random((1, num_classes - 1)).astype("float32")
        self.attrs = {'num_classes': num_classes}
        self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
        pre_output, out = hsigmoid(x, w, label, bias, num_classes)
        self.outputs = {'PreOut': pre_output, 'Out': out}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))


if __name__ == '__main__':
    unittest.main()
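CodeTable above encodes each class as a path in an implicit complete binary tree: offsetting the label by num_classes gives a node id whose bits, read below the leading set bit, are the left/right decisions. A tiny worked example using the same arithmetic (names are ours; cal_bit returns a masked value, which we normalize to 0/1 here):

# sketch: (internal node index, branch bit) pairs per tree level, per CodeTable
import math

def path_of(num_classes, label):
    c = num_classes + label
    length = int(math.floor(math.log(c, 2)))   # find_latest_set(c) - 1
    return [((c >> (bit + 1)) - 1, (c >> bit) & 1) for bit in range(length)]

for label in range(4):
    print(label, path_of(4, label))
# label 0 -> c=4=0b100: passes internal nodes 1 then 0, both via the 0 branch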
@@ -16,20 +16,45 @@ import numpy as np
 from op_test import OpTest

-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    batchsize = in_shape[0]
     img_height = in_shape[2]
     img_width = in_shape[3]
-
-    paddings = attrs['paddings']
-    kernels = attrs['kernels']
-    strides = attrs['strides']
-
-    output_height = \
-        1 + \
-        (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
-        strides[0]
-
-    output_width = \
-        1 + \
-        (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-        strides[1]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            if img_real_size[index, 0] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 0] / out_stride[1] + 1
+            output_height[0,index] = \
+                1 + \
+                (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                strides[0]
+            output_width[0,index] = \
+                1 + \
+                (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
+                1 + \
+                (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                strides[0]
+            output_width[0,index] = \
+                1 + \
+                (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                strides[1]
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                             im_row_offset][im_col_offset]

-def Im2Sequence(inputs, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+def Im2Sequence(inputs, img_real_size, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
     img_channels = inputs.shape[1]
     batch_size = inputs.shape[0]
-    out = np.zeros([
-        batch_size, output_height, output_width, img_channels,
-        attrs['kernels'][0], attrs['kernels'][1]
-    ]).astype("float32")
-
-    for i in range(len(inputs)):
-        im2col(attrs, inputs[i], out[i])
-
-    out = out.reshape([
-        batch_size * output_height * output_width,
-        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
-    ])
+    out = []
+    for index in range(batch_size):
+        tmp = np.zeros([
+            output_height[0, index], output_width[0, index], img_channels,
+            attrs['kernels'][0], attrs['kernels'][1]
+        ]).astype("float32")
+        out.append(tmp)
+    for index in range(len(inputs)):
+        im2col(attrs, inputs[index], out[index])
+        out[index] = out[index].reshape([
+            output_height[0, index] * output_width[0, index],
+            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+        ])
+    out = np.concatenate(out, axis=0)
     return out
@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
         }

     def setUp(self):
@@ -113,7 +141,8 @@
             self.batch_size, self.img_channels, self.img_height, self.img_width
         ]).astype("float32")

-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
         self.inputs = {'X': x}
         self.outputs = {'Out': out}
@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
         }

 class TestBlockExpandOpCase3(TestBlockExpandOp):
     def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
         self.img_channels = 1
         self.img_height = 4
         self.img_width = 5
         self.attrs = {
             'kernels': [2, 1],
             'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
         }
@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
         self.attrs = {
             'kernels': [2, 2],
             'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
         }
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
 if __name__ == '__main__':
     unittest.main()
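The output-shape formula that both branches of get_output_shape share is ordinary convolution arithmetic with ceiling division. Worked numerically (our example values, matching TestBlockExpandOpCase5's config):

# sketch: one output dimension of im2sequence, as explicit ceiling division
def im2seq_out_dim(size, pad_before, pad_after, kernel, stride):
    return 1 + (size + pad_before + pad_after - kernel + stride - 1) // stride

# height 4, paddings [2, 1, 2, 1], kernels [2, 1], strides [2, 1]
print(im2seq_out_dim(4, 2, 2, 2, 2))  # output_height = 4
print(im2seq_out_dim(5, 1, 1, 1, 1))  # output_width  = 7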
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest

 import paddle.fluid.layers as layers
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
@@ -173,6 +174,16 @@ class TestBook(unittest.TestCase):
                 x=dat, label=lbl))
         print(str(program))

+    def test_hsigmoid(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[2], dtype='int64')
+            self.assertIsNotNone(
+                layers.hsigmoid(
+                    input=x, label=y, num_classes=2))
+        print(str(program))
+
     def test_sequence_expand(self):
         program = Program()
         with program_guard(program):
@@ -238,7 +249,7 @@ class TestBook(unittest.TestCase):
     def test_get_places(self):
         program = Program()
         with program_guard(program):
-            x = layers.get_places(device_count=4)
+            x = get_places(device_count=4)
             self.assertIsNotNone(x)
         print(str(program))
@@ -251,12 +262,16 @@ class TestBook(unittest.TestCase):
         print(str(program))

     def test_im2sequence(self):
+        print("test_im2sequence")
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
             output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
             self.assertIsNotNone(output)
         print(str(program))
......
@@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
@@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        opts = adagrad_optimizer._create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "adagrad"])
@@ -278,7 +278,7 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
                                                        init_program)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
@@ -287,7 +287,7 @@ class TestAdamOptimizer(unittest.TestCase):
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 4)
         self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
         self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
         moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
@@ -345,7 +345,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
                                                          init_program)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
@@ -354,7 +354,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 3)
         self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
         self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
         moment_acc = accumulators[adamax_optimizer.get_moment_str()]
@@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer.create_optimization_pass(
+        opts = decayed_adagrad_optimizer._create_optimization_pass(
             params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
@@ -475,7 +475,7 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
                                                        init_program)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
......
@@ -101,9 +101,7 @@ class TestMNIST(TestParallelExecutorBase):
         fluid.recordio_writer.convert_reader_to_recordio_file(
             MNIST_RECORDIO_FILE, reader, feeder)

-    def check_simple_fc_convergence(self,
-                                    balance_parameter_opt_between_cards,
-                                    use_cuda=True):
+    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
         self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
         self.check_network_convergence(
             simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
@@ -115,20 +113,19 @@ class TestMNIST(TestParallelExecutorBase):
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=use_reduce)

     def test_simple_fc(self):
-        self.check_simple_fc_convergence(False, use_cuda=True)
-        self.check_simple_fc_convergence(False, use_cuda=False)
+        # use_cuda
+        self.check_simple_fc_convergence(True)
+        self.check_simple_fc_convergence(False)

     def test_simple_fc_with_new_strategy(self):
-        self.check_simple_fc_convergence(True, use_cuda=True)
-        self.check_simple_fc_convergence(True, use_cuda=False)
+        # use_cuda, use_reduce
+        self.check_simple_fc_convergence(True, True)
+        self.check_simple_fc_convergence(False, True)

-    def check_simple_fc_parallel_accuracy(self,
-                                          balance_parameter_opt_between_cards,
-                                          use_cuda=True):
+    def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False):
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
         single_first_loss, single_last_loss = self.check_network_convergence(
@@ -145,8 +142,7 @@ class TestMNIST(TestParallelExecutorBase):
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=True,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=use_reduce)

         for p_f in parallel_first_loss:
             self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
@@ -154,15 +150,15 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)

     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
-        self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
+        self.check_simple_fc_parallel_accuracy(True)
+        self.check_simple_fc_parallel_accuracy(False)

     def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
-        self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
+        # use_cuda, use_reduce
+        self.check_simple_fc_parallel_accuracy(True, True)
+        self.check_simple_fc_parallel_accuracy(False, True)

-    def check_batchnorm_fc_convergence(
-            self, balance_parameter_opt_between_cards, use_cuda):
+    def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False):
         self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
         img = np.zeros(shape=[32, 784], dtype='float32')
         label = np.ones(shape=[32, 1], dtype='int64')
@@ -171,16 +167,16 @@ class TestMNIST(TestParallelExecutorBase):
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=use_reduce)

     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(False, use_cuda=True)
-        self.check_batchnorm_fc_convergence(False, use_cuda=False)
+        self.check_batchnorm_fc_convergence(True)
+        self.check_batchnorm_fc_convergence(False)

     def test_batchnorm_fc_with_new_strategy(self):
-        self.check_batchnorm_fc_convergence(True, use_cuda=True)
-        self.check_batchnorm_fc_convergence(True, use_cuda=False)
+        # use_cuda, use_reduce
+        self.check_batchnorm_fc_convergence(True, True)
+        self.check_batchnorm_fc_convergence(False, True)

 if __name__ == '__main__':
......
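use_reduce toggles between the two gradient-aggregation strategies named in BuildStrategy earlier in this diff: AllReduce leaves every device holding the full summed gradient, while Reduce assigns each parameter an owner that aggregates it (and runs the optimizer there). A toy NumPy simulation of the difference (ours, not the executor's code):

# sketch: AllReduce vs Reduce gradient aggregation
import numpy as np

num_devices, num_params = 4, 6
grads = [np.random.rand(num_params) for _ in range(num_devices)]

# AllReduce: every device materializes the sum of every gradient.
allreduced = np.sum(grads, axis=0)
per_device_allreduce = [allreduced.copy() for _ in range(num_devices)]

# Reduce: parameter i is owned by device i % num_devices; only the
# owner holds the aggregated gradient for that parameter.
owners = [i % num_devices for i in range(num_params)]
reduced = {i: sum(g[i] for g in grads) for i in range(num_params)}

assert all(np.allclose(d, allreduced) for d in per_device_allreduce)
assert np.allclose([reduced[i] for i in range(num_params)], allreduced)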
@@ -13,8 +13,12 @@
 # limitations under the License.

 import paddle.fluid as fluid
+import paddle.fluid.layers.ops as ops
+from paddle.fluid.initializer import init_on_cpu
+from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
+import math
 import os
@@ -131,30 +135,71 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):

 class TestResnet(TestParallelExecutorBase):
-    def check_resnet_convergence(self,
-                                 balance_parameter_opt_between_cards,
-                                 use_cuda=True,
-                                 iter=20):
+    def check_resnet_convergence_with_learning_rate_decay(self,
+                                                          use_cuda=True,
+                                                          use_reduce=False,
+                                                          iter=20):
         os.environ['CPU_NUM'] = str(4)

+        def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
+            """
+            Applies cosine decay to the learning rate.
+            lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
+            """
+            global_step = _decay_step_counter()
+
+            with init_on_cpu():
+                epoch = ops.floor(global_step / step_each_epoch)
+                decayed_lr = learning_rate * \
+                    (ops.cos(epoch * (math.pi / epochs)) + 1)/2
+            return decayed_lr
+
+        def _optimizer(learning_rate=0.01):
+            optimizer = fluid.optimizer.Momentum(
+                learning_rate=_cosine_decay(
+                    learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            return optimizer
+
         import functools

         batch_size = 2
-        self.check_network_convergence(
+
+        single_first_loss, single_last_loss = self.check_network_convergence(
             functools.partial(
                 SE_ResNeXt50Small, batch_size=batch_size),
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
-            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-        )
+            use_reduce=use_reduce,
+            optimizer=_optimizer,
+            use_parallel_executor=False)

-    def test_resnet(self):
-        self.check_resnet_convergence(False, use_cuda=True)
-        self.check_resnet_convergence(False, use_cuda=False, iter=5)
-
-    def test_resnet_with_new_strategy(self):
-        self.check_resnet_convergence(True, use_cuda=True)
-        self.check_resnet_convergence(True, use_cuda=False, iter=5)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt50Small, batch_size=batch_size),
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=use_reduce,
+            optimizer=_optimizer)
+
+        for p_f in parallel_first_loss:
+            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
+        for p_l in parallel_last_loss:
+            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+
+    def test_seresnext_with_learning_rate_decay(self):
+        self.check_resnet_convergence_with_learning_rate_decay(True, False)
+        self.check_resnet_convergence_with_learning_rate_decay(
+            False, False, iter=5)
+
+    def test_seresnext_with_new_strategy_with_learning_rate_decay(self):
+        self.check_resnet_convergence_with_learning_rate_decay(True, True)
+        self.check_resnet_convergence_with_learning_rate_decay(
+            False, True, iter=5)

 if __name__ == '__main__':
......
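The _cosine_decay helper added above anneals the learning rate over half a cosine period across `epochs`. Its value can be checked in plain Python (same formula, our tabulation):

# sketch: cosine learning-rate decay, evaluated eagerly
import math

def cosine_decay(lr, step, step_each_epoch, epochs):
    epoch = step // step_each_epoch
    return lr * (math.cos(epoch * math.pi / epochs) + 1) / 2

# step_each_epoch=2, epochs=1 as in the test: the rate holds, then hits zero
for step in range(4):
    print(step, cosine_decay(0.01, step, step_each_epoch=2, epochs=1))
# steps 0-1 -> 0.01, steps 2-3 -> 0.0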
@@ -15,6 +15,7 @@
 import unittest

 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy
@@ -115,7 +116,7 @@ class BaseParallelForTest(unittest.TestCase):
         if use_parallel:
             thread_num = fluid.core.get_cuda_device_count(
             ) if use_gpu else 8
-            places = fluid.layers.get_places(thread_num)
+            places = get_places(thread_num)
             pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
             data = next(generator)
@@ -32,6 +32,7 @@ class TestPriorBoxOp(OpTest):
             'variances': self.variances,
             'flip': self.flip,
             'clip': self.clip,
+            'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order,
             'step_w': self.step_w,
             'step_h': self.step_h,
             'offset': self.offset
@@ -52,6 +53,9 @@ class TestPriorBoxOp(OpTest):
         max_sizes = [5, 10]
         self.max_sizes = np.array(max_sizes).astype('float32').tolist()

+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = False
+
     def init_test_params(self):
         self.layer_w = 32
         self.layer_h = 32
@@ -71,6 +75,7 @@ class TestPriorBoxOp(OpTest):
         self.set_max_sizes()
         self.aspect_ratios = [2.0, 3.0]
         self.flip = True
+        self.set_min_max_aspect_ratios_order()
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
         self.aspect_ratios = np.array(
             self.aspect_ratios, dtype=np.float).flatten()
@@ -78,7 +83,6 @@ class TestPriorBoxOp(OpTest):
         self.variances = np.array(self.variances, dtype=np.float).flatten()
         self.clip = True
         self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
         if len(self.max_sizes) > 0:
             self.num_priors += len(self.max_sizes)
@@ -106,26 +110,60 @@ class TestPriorBoxOp(OpTest):
                 idx = 0
                 for s in range(len(self.min_sizes)):
                     min_size = self.min_sizes[s]
-                    # rest of priors
-                    for r in range(len(self.real_aspect_ratios)):
-                        ar = self.real_aspect_ratios[r]
-                        c_w = min_size * math.sqrt(ar) / 2
-                        c_h = (min_size / math.sqrt(ar)) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
-                    if len(self.max_sizes) > 0:
-                        max_size = self.max_sizes[s]
-                        # second prior: aspect_ratio = 1,
-                        c_w = c_h = math.sqrt(min_size * max_size) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
+                    if not self.min_max_aspect_ratios_order:
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                    else:
+                        c_w = c_h = min_size / 2.
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                    (c_y - c_h) / self.image_h,
                                                    (c_x + c_w) / self.image_w,
                                                    (c_y + c_h) / self.image_h]
+                        idx += 1
+                        if len(self.max_sizes) > 0:
+                            max_size = self.max_sizes[s]
+                            # second prior: aspect_ratio = 1,
+                            c_w = c_h = math.sqrt(min_size * max_size) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1
+                        # rest of priors
+                        for r in range(len(self.real_aspect_ratios)):
+                            ar = self.real_aspect_ratios[r]
+                            if abs(ar - 1.) < 1e-6:
+                                continue
+                            c_w = min_size * math.sqrt(ar) / 2
+                            c_h = (min_size / math.sqrt(ar)) / 2
+                            out_boxes[h, w, idx, :] = [
+                                (c_x - c_w) / self.image_w, (c_y - c_h) /
+                                self.image_h, (c_x + c_w) / self.image_w,
+                                (c_y + c_h) / self.image_h
+                            ]
+                            idx += 1

         # clip the prior's coordinates such that they are within [0, 1]
         if self.clip:
@@ -137,10 +175,15 @@ class TestPriorBoxOp(OpTest):
         self.out_var = out_var.astype('float32')

-class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp):
     def set_max_sizes(self):
         self.max_sizes = []

+class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp):
+    def set_min_max_aspect_ratios_order(self):
+        self.min_max_aspect_ratios_order = True

 if __name__ == '__main__':
     unittest.main()
...@@ -181,13 +181,13 @@ class TestBlockDesc(unittest.TestCase): ...@@ -181,13 +181,13 @@ class TestBlockDesc(unittest.TestCase):
self.assertIsNotNone(block) self.assertIsNotNone(block)
op1 = block.append_op() op1 = block.append_op()
op2 = block.append_op() op2 = block.append_op()
op0 = block.prepend_op() op0 = block._prepend_op()
all_ops = [] all_ops = []
for idx in xrange(0, block.op_size()): for idx in xrange(0, block.op_size()):
all_ops.append(block.op(idx)) all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op0, op1, op2]) self.assertEqual(all_ops, [op0, op1, op2])
def test_remove_op(self): def test__remove_op(self):
program = Program() program = Program()
program_desc = program.desc program_desc = program.desc
self.assertIsNotNone(program_desc) self.assertIsNotNone(program_desc)
...@@ -201,8 +201,8 @@ class TestBlockDesc(unittest.TestCase): ...@@ -201,8 +201,8 @@ class TestBlockDesc(unittest.TestCase):
op1.set_type("test") op1.set_type("test")
op2.set_type("test") op2.set_type("test")
block.remove_op(1, 2) block._remove_op(1, 2)
program.sync_with_cpp() program._sync_with_cpp()
all_ops = [] all_ops = []
for idx in xrange(0, block.op_size()): for idx in xrange(0, block.op_size()):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import numpy as np
from threading import Thread
def feed_data(feed_queue, inputs):
for in_data in inputs:
feed_queue.push(in_data)
class TestPyReader(unittest.TestCase):
def setUp(self):
self.capacity = 10
self.batch_size_min = 10
self.batch_size_max = 20
self.shapes = [(-1, 3, 2, 1), (-1, 1)]
self.lod_levels = [0, 0]
self.dtypes = ['float32', 'int64']
self.iterations = 20
def test_single_thread_main(self):
self.main(use_thread=False)
def test_multiple_thread_main(self):
self.main(use_thread=True)
def main(self, use_thread=False):
with fluid.program_guard(fluid.Program(), fluid.Program()):
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
) else fluid.CPUPlace()
executor = fluid.Executor(place)
data_file, feed_queue = fluid.layers.py_reader(
capacity=self.capacity,
dtypes=self.dtypes,
lod_levels=self.lod_levels,
shapes=self.shapes)
read_out_data = fluid.layers.read_file(data_file)
self.inputs = []
for i in range(self.iterations):
in_data = fluid.LoDTensorArray()
batch_size = np.random.random_integers(self.batch_size_min,
self.batch_size_max)
for shape, dtype in zip(self.shapes, self.dtypes):
next_data = np.random.uniform(
low=0, high=1000,
size=(batch_size, ) + shape[1:]).astype(dtype)
in_data.append(executor.as_lodtensor(next_data))
self.inputs.append(in_data)
executor.run(fluid.default_startup_program())
self.outputs = []
if use_thread:
thread = Thread(
target=feed_data, args=(feed_queue, self.inputs))
thread.start()
for in_data in self.inputs:
self.outputs.append(
executor.run(fetch_list=list(read_out_data)))
else:
for in_data in self.inputs:
feed_queue.push(in_data)
self.outputs.append(
executor.run(fetch_list=list(read_out_data)))
feed_queue.close()
self.validate()
def validate(self):
self.assertEqual(len(self.inputs), len(self.outputs))
for in_data_list, out_data_list in zip(self.inputs, self.outputs):
self.assertEqual(len(in_data_list), len(out_data_list))
in_data_list_np = [
np.array(in_lod_tensor) for in_lod_tensor in in_data_list
]
for in_data, out_data in zip(in_data_list_np, out_data_list):
self.assertTrue((in_data == out_data).all())
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import threading
import multiprocessing
import os
def as_tensor(np_array_or_tensor, place=None):
if isinstance(np_array_or_tensor, fluid.LoDTensor):
return np_array_or_tensor
if place is None:
place = fluid.CPUPlace()
tensor = fluid.LoDTensor()
tensor.set(np_array_or_tensor, place)
return tensor
def as_numpy(tensor_or_numpy):
return tensor_or_numpy if isinstance(
tensor_or_numpy, np.ndarray) else np.array(tensor_or_numpy)
def feed_data(feed_queue, reader):
data_generator = reader()
while True:
data = next(data_generator, None)
if data is None or not feed_queue.push(data):
break
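# Added note: feed_queue.push presumably returns a falsy value once the
# consumer closes the queue, so this loop stops either when the reader
# is exhausted (data is None) or when the queue has been closed.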
def simple_fc_net(in_size,
class_num,
hidden_sizes,
batch_size,
queue_capacity,
use_double_buffer=False):
reader, feed_queue = fluid.layers.py_reader(
capacity=queue_capacity,
shapes=[[-1, in_size], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.batch(reader, batch_size=batch_size)
if use_double_buffer:
reader = fluid.layers.double_buffer(reader)
in_data, label = fluid.layers.read_file(reader)
hidden = in_data
for hidden_size in hidden_sizes:
hidden = fluid.layers.fc(
hidden,
size=hidden_size,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax')
loss = fluid.layers.mean(
fluid.layers.cross_entropy(
input=predict_label, label=label))
optimizer = fluid.optimizer.Adam()
optimizer.minimize(loss)
return in_data, label, loss, optimizer, feed_queue
class TestPyReaderUsingExecutor(unittest.TestCase):
def setUp(self):
self.in_size = 1000
self.hidden_sizes = [50, 30, 20]
self.class_num = 10
self.batch_size = 32
self.iterations = 10
self.queue_capacity = 50
def test(self):
for use_cuda in [False, True]:
for use_parallel_executor in [False, True]:
for use_double_buffer in [False, True]:
print('Test Parameters:'),
print({
'use_cuda': use_cuda,
'use_parallel_executor': use_parallel_executor,
'use_double_buffer': use_double_buffer
})
self.main(use_cuda, use_parallel_executor,
use_double_buffer)
def random_reader(self):
def reader():
self.inputs = []
cnt = 0
while True:
tensors = fluid.LoDTensorArray()
in_data = np.random.uniform(
low=0, high=1, size=(1, self.in_size)).astype('float32')
tensors.append(as_tensor(in_data))
label = np.random.random_integers(
low=0, high=self.class_num - 1, size=(1, 1)).astype('int64')
tensors.append(as_tensor(label))
if cnt < self.iterations * self.batch_size * self.batch_size_times:
if cnt % (self.batch_size * self.batch_size_times) == 0:
self.inputs.append([in_data, label])
else:
self.inputs[-1][0] = np.concatenate(
(self.inputs[-1][0], in_data), axis=0)
self.inputs[-1][1] = np.concatenate(
(self.inputs[-1][1], label), axis=0)
elif not self.use_double_buffer:
break
yield tensors
cnt += 1
yield None
return reader
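# Added note: the reader above yields one instance per step; the
# graph-side fluid.layers.batch op in simple_fc_net groups them, so
# self.inputs rebuilds the matching host-side batches
# (batch_size * batch_size_times instances each) for validate().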
def main(self,
use_cuda=True,
use_parallel_executor=False,
use_double_buffer=False):
assert not use_cuda or use_cuda and core.is_compiled_with_cuda()
self.use_cuda = use_cuda
self.use_parallel_executor = use_parallel_executor
self.use_double_buffer = use_double_buffer
startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
in_data, label, loss, optimizer, feed_queue = simple_fc_net(
in_size=self.in_size,
class_num=self.class_num,
hidden_sizes=self.hidden_sizes,
batch_size=self.batch_size,
queue_capacity=self.queue_capacity,
use_double_buffer=self.use_double_buffer)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
startup_exe = fluid.Executor(place)
startup_exe.run(startup_program)
if use_parallel_executor:
main_exe = fluid.ParallelExecutor(use_cuda, loss_name=loss.name)
if use_cuda:
self.batch_size_times = core.get_cuda_device_count()
else:
self.batch_size_times = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
else:
main_exe = startup_exe
self.batch_size_times = 1
reader = self.random_reader()
thread = threading.Thread(
target=feed_data, args=(feed_queue, reader))
thread.start()
self.outputs = []
for _ in range(self.iterations):
fetches = main_exe.run(fetch_list=[in_data.name, label.name])
fetches = [as_numpy(fetch) for fetch in fetches]
self.outputs.append(fetches)
feed_queue.close()
self.validate()
def validate(self):
self.assertEqual(len(self.inputs), len(self.outputs))
for batch_in, batch_out in zip(self.inputs, self.outputs):
self.assertEqual(len(batch_in), len(batch_out))
if self.use_parallel_executor and not self.use_double_buffer:
self.validate_unordered_batch(batch_in, batch_out)
else:
for in_data, out_data in zip(batch_in, batch_out):
self.assertEqual(in_data.shape, out_data.shape)
if not self.use_parallel_executor:
self.assertTrue((in_data == out_data).all())
def validate_unordered_batch(self, batch_in, batch_out):
out_index_left_set = set(range(self.batch_size * self.batch_size_times))
mapping_num = 0
for i in range(self.batch_size * self.batch_size_times):
for j in out_index_left_set:
flag = True
for k in range(len(batch_in)):
in_data = batch_in[k][i]
out_data = batch_out[k][j]
if (in_data != out_data).any():
flag = False
break
if flag:
out_index_left_set.remove(j)
mapping_num += 1
break
self.assertEqual(mapping_num, self.batch_size * self.batch_size_times)
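# Added note: validate_unordered_batch is effectively a greedy bipartite
# matching between input and output rows; mapping_num reaching
# batch_size * batch_size_times proves the batches are equal up to a
# permutation of rows.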
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.v2 as paddle
import numpy as np
import unittest
class TestReaderReset(unittest.TestCase):
def prepare_data(self):
def fake_data_generator():
for n in xrange(self.total_ins_num):
yield np.ones(self.ins_shape) * n, n
# Prepare data
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(fake_data_generator, batch_size=1)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name='data', shape=[3], dtype='float32'),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
self.data_file_name, reader, feeder)
def setUp(self):
self.use_cuda = fluid.core.is_compiled_with_cuda()
self.data_file_name = './reader_reset_test.recordio'
self.ins_shape = [3]
self.batch_size = 5
self.total_ins_num = self.batch_size * 20
self.test_pass_num = 100
self.prepare_data()
def main(self, with_double_buffer):
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
data_reader_handle = fluid.layers.io.open_files(
filenames=[self.data_file_name],
shapes=[[-1] + self.ins_shape, [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
pass_num=1)
data_reader = fluid.layers.io.batch(data_reader_handle,
self.batch_size)
if with_double_buffer:
data_reader = fluid.layers.double_buffer(data_reader)
image, label = fluid.layers.read_file(data_reader)
fetch_list = [image.name, label.name]
place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
build_strategy = fluid.BuildStrategy()
if with_double_buffer:
build_strategy.enable_data_balance = True
exec_strategy = fluid.ExecutionStrategy()
parallel_exe = fluid.ParallelExecutor(
use_cuda=self.use_cuda,
main_program=main_prog,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
data_appeared = [False] * self.total_ins_num
pass_count = 0
while (True):
try:
data_val, label_val = parallel_exe.run(fetch_list,
return_numpy=True)
ins_num = data_val.shape[0]
broadcasted_label = np.ones((ins_num, ) + tuple(
self.ins_shape)) * label_val.reshape((ins_num, 1))
self.assertTrue((data_val == broadcasted_label).all())  # compare element-wise, not array truthiness
for l in label_val:
self.assertFalse(data_appeared[l[0]])
data_appeared[l[0]] = True
except fluid.core.EOFException:
pass_count += 1
if with_double_buffer:
data_appeared = data_appeared[:-parallel_exe.device_count *
self.batch_size]
for i in data_appeared:
self.assertTrue(i)
if pass_count < self.test_pass_num:
data_appeared = [False] * self.total_ins_num
data_reader_handle.reset()
else:
break
def test_all(self):
self.main(with_double_buffer=False)
self.main(with_double_buffer=True)
if __name__ == '__main__':
unittest.main()
...@@ -40,12 +40,12 @@ class TestSelectedRows(unittest.TestCase): ...@@ -40,12 +40,12 @@ class TestSelectedRows(unittest.TestCase):
# compare tensor # compare tensor
self.assertAlmostEqual(2.0, self.assertAlmostEqual(2.0,
selected_rows.get_tensor().get_float_element(0)) selected_rows.get_tensor()._get_float_element(0))
self.assertAlmostEqual(1.0, self.assertAlmostEqual(1.0,
selected_rows.get_tensor().get_float_element(1)) selected_rows.get_tensor()._get_float_element(1))
self.assertAlmostEqual( self.assertAlmostEqual(
4.0, 4.0,
selected_rows.get_tensor().get_float_element(2 * row_numel + 8)) selected_rows.get_tensor()._get_float_element(2 * row_numel + 8))
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -45,8 +45,8 @@ class TestShrinkRNNMemoryBase(unittest.TestCase): ...@@ -45,8 +45,8 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
def sum_lodtensor(self, tensor): def sum_lodtensor(self, tensor):
sum_res = 0.0 sum_res = 0.0
for i in xrange(np.product(tensor.get_dims())): for i in xrange(np.product(tensor.shape())):
sum_res += tensor.get_float_element(i) sum_res += tensor._get_float_element(i)
return sum_res return sum_res
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
# Correct: General.
class TestSqueezeOp(OpTest):
def setUp(self):
self.op_type = "squeeze"
self.init_test_case()
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = (0, 2)
self.new_shape = (3, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": False}
# Correct: There is a minus (negative) axis.
class TestSqueezeOp1(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = (0, -2)
self.new_shape = (3, 5)
# Correct: No axes input.
class TestSqueezeOp2(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = ()
self.new_shape = (3, 5)
# Correct: Only part of the axes are squeezed.
class TestSqueezeOp3(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 1, 5, 1, 4, 1)
self.axes = (1, -1)
self.new_shape = (3, 5, 1, 4)
# Correct: Inplace.
class TestSqueezeOpInplace1(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = (0, 2)
self.new_shape = (3, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
# Correct: Inplace. There is a minus (negative) axis.
class TestSqueezeOpInplace2(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = (0, -2)
self.new_shape = (3, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
# Correct: Inplace. No axes input.
class TestSqueezeOpInplace3(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 3, 1, 5)
self.axes = ()
self.new_shape = (3, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
# Correct: Inplace. Only part of the axes are squeezed.
class TestSqueezeOpInplace4(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 1, 5, 1, 4, 1)
self.axes = (1, -1)
self.new_shape = (3, 5, 1, 4)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
if __name__ == "__main__":
unittest.main()
...@@ -25,8 +25,8 @@ class TestTensor(unittest.TestCase): ...@@ -25,8 +25,8 @@ class TestTensor(unittest.TestCase):
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set_dims([1000, 784]) tensor._set_dims([1000, 784])
tensor.alloc_int(place) tensor._alloc_int(place)
tensor_array = numpy.array(tensor) tensor_array = numpy.array(tensor)
self.assertEqual((1000, 784), tensor_array.shape) self.assertEqual((1000, 784), tensor_array.shape)
tensor_array[3, 9] = 1 tensor_array[3, 9] = 1
...@@ -44,8 +44,8 @@ class TestTensor(unittest.TestCase): ...@@ -44,8 +44,8 @@ class TestTensor(unittest.TestCase):
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set_dims([1000, 784]) tensor._set_dims([1000, 784])
tensor.alloc_float(place) tensor._alloc_float(place)
tensor_array = numpy.array(tensor) tensor_array = numpy.array(tensor)
self.assertEqual((1000, 784), tensor_array.shape) self.assertEqual((1000, 784), tensor_array.shape)
...@@ -63,8 +63,8 @@ class TestTensor(unittest.TestCase): ...@@ -63,8 +63,8 @@ class TestTensor(unittest.TestCase):
var_lod = scope.var("test_lod_tensor") var_lod = scope.var("test_lod_tensor")
lod_tensor = var_lod.get_tensor() lod_tensor = var_lod.get_tensor()
lod_tensor.set_dims([4, 4, 6]) lod_tensor._set_dims([4, 4, 6])
lod_tensor.alloc_int(place) lod_tensor._alloc_int(place)
array = numpy.array(lod_tensor) array = numpy.array(lod_tensor)
array[0, 0, 0] = 3 array[0, 0, 0] = 3
array[3, 3, 5] = 10 array[3, 3, 5] = 10
...@@ -84,8 +84,8 @@ class TestTensor(unittest.TestCase): ...@@ -84,8 +84,8 @@ class TestTensor(unittest.TestCase):
var_lod = scope.var("test_lod_tensor") var_lod = scope.var("test_lod_tensor")
lod_tensor = var_lod.get_tensor() lod_tensor = var_lod.get_tensor()
lod_tensor.set_dims([5, 2, 3, 4]) lod_tensor._set_dims([5, 2, 3, 4])
lod_tensor.alloc_float(place) lod_tensor._alloc_float(place)
tensor_array = numpy.array(lod_tensor) tensor_array = numpy.array(lod_tensor)
self.assertEqual((5, 2, 3, 4), tensor_array.shape) self.assertEqual((5, 2, 3, 4), tensor_array.shape)
...@@ -104,14 +104,13 @@ class TestTensor(unittest.TestCase): ...@@ -104,14 +104,13 @@ class TestTensor(unittest.TestCase):
self.assertListEqual(lod_py, lod) self.assertListEqual(lod_py, lod)
def test_lod_tensor_init(self): def test_lod_tensor_init(self):
scope = core.Scope()
place = core.CPUPlace() place = core.CPUPlace()
lod_py = [[2, 1], [1, 2, 2]] lod_py = [[2, 1], [1, 2, 2]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4]) lod_tensor._set_dims([5, 2, 3, 4])
lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.set_recursive_sequence_lengths(lod_py)
lod_tensor.alloc_float(place) lod_tensor._alloc_float(place)
tensor_array = numpy.array(lod_tensor) tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 0] = 1.0
tensor_array[0, 0, 0, 1] = 2.0 tensor_array[0, 0, 0, 1] = 2.0
...@@ -129,9 +128,9 @@ class TestTensor(unittest.TestCase): ...@@ -129,9 +128,9 @@ class TestTensor(unittest.TestCase):
lod_py = [[2, 1], [1, 2, 2]] lod_py = [[2, 1], [1, 2, 2]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4]) lod_tensor._set_dims([5, 2, 3, 4])
lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.set_recursive_sequence_lengths(lod_py)
lod_tensor.alloc_float(place) lod_tensor._alloc_float(place)
tensor_array = numpy.array(lod_tensor) tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 0] = 1.0
tensor_array[0, 0, 0, 1] = 2.0 tensor_array[0, 0, 0, 1] = 2.0
...@@ -149,15 +148,15 @@ class TestTensor(unittest.TestCase): ...@@ -149,15 +148,15 @@ class TestTensor(unittest.TestCase):
tensor = var.get_tensor() tensor = var.get_tensor()
tensor.set_dims([0, 1]) tensor._set_dims([0, 1])
tensor.alloc_float(place) tensor._alloc_float(place)
tensor_array = numpy.array(tensor) tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape) self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0) gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place) tensor._alloc_float(gpu_place)
tensor_array = numpy.array(tensor) tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape) self.assertEqual((0, 1), tensor_array.shape)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
# Correct: General.
class TestUnsqueezeOp(OpTest):
def setUp(self):
self.init_test_case()
self.op_type = "unsqueeze"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
def init_test_case(self):
self.ori_shape = (3, 5)
self.axes = (1, 2)
self.new_shape = (3, 1, 1, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": False}
# Correct: Single input index.
class TestUnsqueezeOp1(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 5)
self.axes = (-1, )
self.new_shape = (3, 5, 1)
# Correct: Mixed input axis.
class TestUnsqueezeOp2(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 5)
self.axes = (0, -1)
self.new_shape = (1, 3, 5, 1)
# Correct: There is a duplicated axis.
class TestUnsqueezeOp3(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 2, 5)
self.axes = (0, 3, 3)
self.new_shape = (1, 3, 2, 1, 1, 5)
# Correct: Reversed axes.
class TestUnsqueezeOp4(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 2, 5)
self.axes = (3, 1, 1)
self.new_shape = (3, 1, 1, 2, 5, 1)
# Correct: Inplace.
class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 5)
self.axes = (0, 2)
self.new_shape = (1, 3, 1, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
# Correct: Inplace. There is a minus (negative) index.
class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 5)
self.axes = (0, -2)
self.new_shape = (1, 3, 1, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
# Correct: Inplace. There is a duplicated axis.
class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
def init_test_case(self):
self.ori_shape = (3, 2, 5)
self.axes = (0, 3, 3)
self.new_shape = (1, 3, 2, 1, 1, 5)
def init_attrs(self):
self.attrs = {"axes": self.axes, "inplace": True}
if __name__ == "__main__":
unittest.main()
...@@ -75,7 +75,7 @@ def set_input(scope, op, inputs, place): ...@@ -75,7 +75,7 @@ def set_input(scope, op, inputs, place):
if isinstance(var, tuple): if isinstance(var, tuple):
tensor.set_recursive_sequence_lengths(var[1]) tensor.set_recursive_sequence_lengths(var[1])
var = var[0] var = var[0]
tensor.set_dims(var.shape) tensor._set_dims(var.shape)
tensor.set(var, place) tensor.set(var, place)
elif isinstance(var, float): elif isinstance(var, float):
scope.find_var(var_name).set_float(var) scope.find_var(var_name).set_float(var)
......
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
import contextlib import contextlib
import os import os
import errno
import shutil
import time
import core import core
...@@ -94,7 +97,7 @@ class EndStepEvent(object): ...@@ -94,7 +97,7 @@ class EndStepEvent(object):
class CheckpointConfig(object): class CheckpointConfig(object):
""" """
Parameter object for :code:`fluid.io.save_checkpoint` and Parameter object for :code:`save_checkpoint` and
:code:`fluid.Trainer`. Used to configure how to save checkpoints. :code:`fluid.Trainer`. Used to configure how to save checkpoints.
Args: Args:
...@@ -237,7 +240,7 @@ class Trainer(object): ...@@ -237,7 +240,7 @@ class Trainer(object):
self.checkpoint_cfg = checkpoint_config self.checkpoint_cfg = checkpoint_config
if self.checkpoint_cfg: if self.checkpoint_cfg:
assert isinstance(self.checkpoint_cfg, CheckpointConfig) assert isinstance(self.checkpoint_cfg, CheckpointConfig)
serial = io.get_latest_checkpoint_serial( serial = _get_latest_checkpoint_serial(
self.checkpoint_cfg.checkpoint_dir) self.checkpoint_cfg.checkpoint_dir)
self.checkpoint_cfg.load_serial = serial if serial >= 0 else None self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
...@@ -276,32 +279,15 @@ class Trainer(object): ...@@ -276,32 +279,15 @@ class Trainer(object):
exe = executor.Executor(place) exe = executor.Executor(place)
exe.run(self.startup_program) exe.run(self.startup_program)
if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None:
with self._prog_and_scope_guard(): self._load_checkpoint()
exe = executor.Executor(place)
io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
self.checkpoint_cfg.load_serial,
self.startup_program)
if not self.checkpoint_cfg.pserver_id:
epoch_id, step_id = io.load_trainer_args(
self.checkpoint_cfg.checkpoint_dir,
self.checkpoint_cfg.load_serial, self.trainer_id,
self._get_checkpoint_load_args())
self.checkpoint_cfg.epoch_id = int(epoch_id)
self.checkpoint_cfg.step_id = int(step_id)
else:
if self.checkpoint_cfg.lookup_table_name:
io.load_lookup_table_vars(
exe, self.checkpoint_cfg.checkpoint_dir,
self.startup_program,
self.checkpoint_cfg.pserver_id,
self.checkpoint_cfg.lookup_table_name)
if param_path and os.path.isdir(param_path): if param_path and os.path.isdir(param_path):
# load params from param_path into scope # load params from param_path into scope
io.load_persist_vars_without_grad( io.load_persistables(
exe, dirname=param_path, program=self.startup_program) executor=exe,
dirname=param_path,
main_program=self.startup_program)
def _transpile_nccl2_dist(self): def _transpile_nccl2_dist(self):
# PADDLE_TRAINER_IPS # PADDLE_TRAINER_IPS
...@@ -549,7 +535,7 @@ class Trainer(object): ...@@ -549,7 +535,7 @@ class Trainer(object):
def _clean_checkpoint(self): def _clean_checkpoint(self):
assert self.checkpoint_cfg assert self.checkpoint_cfg
io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
def _get_checkpoint_load_args(self): def _get_checkpoint_load_args(self):
""" """
...@@ -572,7 +558,7 @@ class Trainer(object): ...@@ -572,7 +558,7 @@ class Trainer(object):
if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
and step_id % self.checkpoint_cfg.step_interval == 0: and step_id % self.checkpoint_cfg.step_interval == 0:
exe = executor.Executor(self.place) exe = executor.Executor(self.place)
io.save_checkpoint( save_checkpoint(
executor=exe, executor=exe,
checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
trainer_id=self.trainer_id, trainer_id=self.trainer_id,
...@@ -580,6 +566,41 @@ class Trainer(object): ...@@ -580,6 +566,41 @@ class Trainer(object):
main_program=self.train_program, main_program=self.train_program,
max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
def _load_checkpoint(self):
with self._prog_and_scope_guard():
exe = executor.Executor(self.place)
load_checkpoint(
executor=exe,
checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
main_program=self.startup_program)
if not self.checkpoint_cfg.pserver_id:
load_trainer_args = self._get_checkpoint_load_args()
trainer_args = load_checkpoint(
executor=exe,
checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
main_program=self.startup_program,
role_id=self.trainer_id,
is_trainer=True,
load_trainer_args=load_trainer_args)
if len(trainer_args) != 2:
raise ValueError(
"the return trainer_args length do not equal _get_checkpoint_load_args"
)
self.checkpoint_cfg.epoch_id = int(trainer_args[0])
self.checkpoint_cfg.step_id = int(trainer_args[1])
else:
if self.checkpoint_cfg.lookup_table_name:
load_checkpoint(
executor=exe,
checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
main_program=self.startup_program,
role_id=self.checkpoint_cfg.pserver_id,
is_trainer=False,
load_trainer_args=None,
load_lookup_table=self.checkpoint_cfg.lookup_table_name)
def build_feed_var_list(program, feed_order): def build_feed_var_list(program, feed_order):
if not isinstance(program, framework.Program): if not isinstance(program, framework.Program):
...@@ -602,3 +623,610 @@ def build_feed_var_list(program, feed_order): ...@@ -602,3 +623,610 @@ def build_feed_var_list(program, feed_order):
program.global_block().var(pair[0]) for pair in sorted_pair_list program.global_block().var(pair[0]) for pair in sorted_pair_list
] ]
return feed_var_list return feed_var_list
# Checkpoint APIs moved from io.py to trainer.py; all of them are now private.
SUCCESS_MARK_FILENAME = "_SUCCESS"
CHECKPOINT_PREFIX = "checkpoint"
MODEL_DIR = "__model__"
LOOKUP_TABLE_DIR = "__lookup_table__"
TRAINER_PREFIX = "trainer"
CHECKPOINT_SEPARATOR = "_"
def save_checkpoint(executor,
checkpoint_dir,
trainer_id,
main_program,
trainer_args=None,
max_num_checkpoints=3,
lookup_table=None,
pserver_endpoints=None):
"""
This function filters out all checkpoint variables from the given
main_program and then saves these variables to the `checkpoint_dir`
directory.
In the training process, we generally save a checkpoint in each
iteration. So there might be a lot of checkpoints in the
`checkpoint_dir`. To avoid them taking too much disk space, the
`max_num_checkpoints` argument is introduced to limit the total number
of checkpoints. If the number of existing checkpoints is greater than
`max_num_checkpoints`, the oldest ones will be scroll-deleted.
A variable is a checkpoint variable and will be saved if it meets
all of the following conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for save checkpoint.
checkpoint_dir(str): The folder where to save checkpoints.
trainer_id(int): current trainer id; if the id is equal to 0, the trainer
is the chief.
trainer_args(dict|None): Current training arguments, such as 'epoch_id'
and 'step_id'.
Default: None
main_program(Program): The program whose checkpoint variables will
be saved.
max_num_checkpoints(int): The maximum total number of existing
checkpoints.
Default: 3
lookup_table(str|None): the lookup table name; when using a distributed
lookup table, the name can be obtained from
DistributeTranspiler.table_name.
pserver_endpoints(list|None): the parameter server ip:port list;
when using a distributed lookup table, it can be obtained from the
distribute arguments.
Returns:
None
Raises:
ValueError: If `checkpoint_dir` is None.
AssertionError: If `trainer_args` is not a dict.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
trainer_args = {"epoch_id": 200,
"step_id": 20} # just an example
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
save_checkpoint(executor=exe,
checkpoint_dir=path,
trainer_id=0,
trainer_args=trainer_args,
main_program=prog,
max_num_checkpoints=3,
lookup_table=table_name,
pserver_endpoints = ps_endpoints)
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
if main_program is None:
raise ValueError('main_program should not be None.')
if trainer_args:
assert isinstance(trainer_args, dict)
is_chief = trainer_id == 0
_make_chekcpoint_dirs(checkpoint_dir)
serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1
cur_dir = _get_serial_dir(checkpoint_dir, serial)
_save_trainer_args(cur_dir, trainer_id, trainer_args)
if is_chief:
_save_persist_vars_without_grad(executor, cur_dir, main_program)
if is_chief and lookup_table and pserver_endpoints:
_save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
pserver_endpoints)
_scroll_delete(checkpoint_dir, max_num_checkpoints)
def load_checkpoint(executor,
checkpoint_dir,
main_program,
role_id=0,
is_trainer=True,
load_trainer_args=None,
load_lookup_table=None):
"""
This function filters out all checkpoint variables from the given
main_program and then tries to load these variables from the
`checkpoint_dir` directory.
In the training process, we generally save a checkpoint in each
iteration, so there is usually more than one checkpoint in the
`checkpoint_dir` (each checkpoint has its own sub-folder). The
checkpoint with the latest serial number found in the directory is
the one that gets loaded.
A variable is a checkpoint variable and will be loaded if it meets
all of the following conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for loading checkpoint.
checkpoint_dir(str): The folder where all checkpoints are.
main_program(Program): The program whose checkpoint variables will
be loaded.
role_id(int): the trainer id or the parameter server id.
is_trainer(bool): trainer is True and parameter server is False.
load_trainer_args(list|None): the list of trainer args to load.
load_lookup_table(str|None): the lookup table name
Returns:
None, or the trainer args read from disk when load_trainer_args is given.
Raises:
ValueError: If `checkpoint_dir` is None.
ValueError: If `main_program` is None.
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
path = "./checkpoints"
prog = fluid.default_main_program()
load_checkpoint(executor=exe, checkpoint_dir=path,
main_program=prog)
# In this example, the `load_checkpoint` function
# will first filter out all checkpoint variables in the default
# main program, and then try to load these variables from the
# latest serial's folder, e.g. "./checkpoints/checkpoint_9/__model__".
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
serial = _get_latest_checkpoint_serial(checkpoint_dir)
# there is nothing to be loaded
if serial is None or serial < 0:
return
if main_program is None:
raise ValueError('main_program should not be None.')
if is_trainer and load_trainer_args is None:
cur_dir = _get_serial_dir(checkpoint_dir, serial)
_load_persist_vars_without_grad(executor, cur_dir, main_program, True)
return
if is_trainer and load_trainer_args:
return _load_trainer_args(checkpoint_dir, serial, role_id,
load_trainer_args)
if not is_trainer and load_lookup_table:
_load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id,
load_lookup_table)
def clean_checkpoint(checkpoint_dir, delete_dir=False):
"""
Clean the checkpoint dir; when training exits normally, the trainer
calls clean_checkpoint to delete the checkpoint directories saved earlier.
delete_dir only works when the directory is empty; otherwise, an OSError is raised.
: param checkpoint_dir
: param delete_dir
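Examples:
.. code-block:: python

# a minimal sketch; the path below is illustrative only
clean_checkpoint(checkpoint_dir="./checkpoints", delete_dir=True)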
"""
if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None")
_scroll_delete(checkpoint_dir, max_num_checkpoints=0)
if delete_dir and not os.listdir(checkpoint_dir):
os.rmdir(checkpoint_dir)
def _load_persist_vars_without_grad(executor,
dirname,
program,
has_model_dir=False):
"""
This function filters out all checkpoint variables from the given
program and then tries to load these variables from the given directory.
A variable is a checkpoint variable if it meets all of the following
conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for loading variables.
dirname(str): The directory path.
program(Program): The program whose checkpoint variables will
be loaded.
has_model_dir(bool): if True, the function loads variables
from a sub directory named '__model__'.
Default: False
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
_load_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog, has_model_dir=True)
# In this example, the `_load_persist_vars_without_grad` function
# will first filter out all checkpoint variables in the default
# main program, and then try to load these variables from the
# folder "./my_paddle_model/__model__".
"""
if has_model_dir:
dirname = _get_model_dir(dirname)
io.load_vars(
executor,
dirname=dirname,
main_program=program,
predicate=_is_checkpoint_var,
filename=None)
def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
"""
The parameter server will load the lookup table's local file into a
SelectedRows variable.
Args:
executor(Executor): The executor to run for loading persistable variables
dirname(str): The directory path
program(Program): find the variable named table_name in this program
pserver_id(int): the serial number in pserver_endpoints list
table_name(str): lookup table name
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
dirname = "./checkpoints/checkpoint_9/"
prog = fluid.default_main_program()
pserver_id = 1
table_name = "share_w"
_load_lookup_table_vars(executor=exe,
dirname=dirname, program=prog, pserver_id=pserver_id,
table_name=table_name)
"""
lookup_table_var = None  # initialize so the assert below fails cleanly when the table is missing
for var in program.list_vars():
if var.name == table_name:
lookup_table_var = var
break
assert lookup_table_var is not None
lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
load_prog = framework.Program()
load_block = load_prog.global_block()
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [lookup_table_var]},
attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
executor.run(load_prog)
def _save_persist_vars_without_grad(executor, dirname, program):
"""
This function filters out all checkpoint variables from the given
program and then saves these variables to a sub-folder '__model__' of
the given directory.
A variable is a checkpoint variable if it meets all of the following
conditions:
1. It is persistable.
2. Its type is not FEED_MINIBATCH, FETCH_LIST, or RAW.
3. Its name contains none of "@GRAD", ".trainer_", or ".block".
Args:
executor(Executor): The executor to run for saving variables.
dirname(str): The directory path.
program(Program): The program whose checkpoint variables will
be saved.
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
_save_persist_vars_without_grad(executor=exe,
dirname=param_path, program=prog)
# In this example, `_save_persist_vars_without_grad` function
# will first filters out all checkpoint variables in the default
# main program, and then saves these variables to the folder
# "./my_paddle_model/__model__".
"""
cur_dir = _get_model_dir(dirname)
io.save_vars(
executor,
dirname=cur_dir,
main_program=program,
vars=None,
predicate=_is_checkpoint_var,
filename=None)
_write_success(cur_dir)
def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
ps_endpoint_list):
"""
This function will send a checkpoint notify message from Trainer 0
to all the pservers.
The checkpoint notify message contains the lookup table name and
the absolute path on the pserver where the lookup_table is to be saved.
Args:
executor(Executor): The executor to run for send checkpoint notify.
dirname(str): The folder where to save checkpoints.
lookup_table(str): the lookup table name; when using a distributed
lookup table, the name can be obtained from
DistributeTranspiler.table_name.
ps_endpoint_list(list): the parameter server ip:port list;
when using a distributed lookup table, it can be obtained from the
distribute arguments.
Return:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
_save_pserver_vars_by_notify(executor=exe,
dirname=param_path, lookup_table=table_name,
ps_endpoint_list=ps_endpoints)
"""
cur_dir = _get_lookuptable_dir(dirname)
checkpoint_notify_program = framework.Program()
checkpoint_notify_block = checkpoint_notify_program.global_block()
attrs = {}
attrs['epmap'] = ps_endpoint_list
attrs['dir'] = cur_dir
attrs['lookup_table'] = lookup_table
checkpoint_notify_block.append_op(
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
executor.run(checkpoint_notify_program)
def _save_trainer_args(dirname, trainer_id, trainer_args):
assert isinstance(trainer_args, dict)
cur_dir = _get_trainer_dir(dirname, trainer_id)
for name, value in trainer_args.iteritems():
args_file = os.path.join(cur_dir, name)
with open(args_file, 'w') as f:
f.write(str(value))
_write_success(cur_dir)
def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
"""
The trainer will load some args from its own directory,
such as epoch_id and step_id.
Args:
checkpoint_dir(str): The folder where all checkpoints are.
serial(int): The serial of checkpoint you would like to load.
trainer_id(int): current trainer id.
trainer_args(list): list about load trainer args
Returns:
list: the argument values (as strings) read from the trainer's directory.
Examples:
.. code-block:: python
param_path = "./checkpoint/"
serial = 7
trainer_id = 2
trainer_args = ["epoch_id", "step_id"]
_load_trainer_args(checkpoint_dir=param_path, serial=serial,
trainer_id=trainer_id, trainer_args=trainer_args)
"""
assert isinstance(trainer_args, list)
cur_dir = _get_serial_dir(checkpoint_dir, serial)
cur_dir = _get_trainer_dir(cur_dir, trainer_id)
ret_values = []
for arg in trainer_args:
cur_file = os.path.join(cur_dir, arg)
with open(cur_file, 'r') as f:
contents = f.read()
ret_values.append(contents.strip())
return ret_values
def _is_checkpoint_var(var):
"""
Not all variables are saved or loaded by the checkpoint: variables whose
type is FEED_MINIBATCH/FETCH_LIST/RAW, or whose name contains "@GRAD",
".trainer_" or ".block", are discarded.
: param var(Variable)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.RAW:
return False
# Names containing "@GRAD" denote gradient variables; the checkpoint will not save them.
if "@GRAD" in var.name:
return False
# Names containing ".trainer_" denote distributed-training variables; the checkpoint will not save them.
if ".trainer_" in var.name:
return False
# Names containing ".block" denote distributed-training variables; the checkpoint will not save them.
if ".block" in var.name:
return False
return var.persistable
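# Illustrative (hypothetical) names: a persistable "fc_0.w_0" passes the
# filter, while "fc_0.w_0@GRAD", "fc_0.w_0.trainer_0" and "fc_0.b_0.block0"
# are all rejected regardless of persistability.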
def _make_chekcpoint_dirs(dirs):
"""
_make_chekcpoint_dirs makes the local directory directly; if the directory already exists, it is ignored.
"""
assert dirs is not None
if os.path.isfile(dirs):
raise OSError(errno.ENOTDIR, "dirs path should be a directory.", dirs)
if not os.path.isdir(dirs):
try:
os.makedirs(dirs)
except OSError as err:
if err.errno != errno.EEXIST:
raise err
def _get_dir_serial(dirname):
try:
# guard the split as well: a directory name without the expected
# "<prefix>_<serial>" form would otherwise raise an uncaught ValueError
_, serial = dirname.rsplit(CHECKPOINT_SEPARATOR, 1)
serial_num = int(serial)
except ValueError:
serial_num = -1
return serial_num
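# e.g. _get_dir_serial("checkpoint_5") == 5, while a non-numeric suffix
# such as in "checkpoint_tmp" falls back to -1.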
def _get_serial_dir(dirname, serial):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
serial_dir = os.path.join(dirname, serial_folder)
_make_chekcpoint_dirs(serial_dir)
return serial_dir
def _get_model_dir(dirname):
model_dir = os.path.join(dirname, MODEL_DIR)
_make_chekcpoint_dirs(model_dir)
return model_dir
def _get_lookuptable_dir(dirname):
lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
_make_chekcpoint_dirs(lookuptable_dir)
return lookuptable_dir
def _get_trainer_dir(dirname, trainer_id):
trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
trainer_dir = os.path.join(dirname, trainer_folder)
_make_chekcpoint_dirs(trainer_dir)
return trainer_dir
def _scroll_delete(dirname, max_num_checkpoints=3):
dirs = os.listdir(dirname)
serial_map = {}
for serial in dirs:
serial_num = _get_dir_serial(serial)
serial_map[serial_num] = serial
if len(serial_map.keys()) <= max_num_checkpoints:
return
serials = serial_map.keys()
serials.sort(reverse=True)
serials = serials[max_num_checkpoints:]
for serial in serials:
cur_dir = _get_serial_dir(dirname, serial)
try:
shutil.rmtree(cur_dir)
except OSError as err:
if err.errno != errno.ENOENT:
raise err
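# Worked example (hypothetical dirs): with max_num_checkpoints=3 and
# checkpoint_0 .. checkpoint_5 present, the three newest (5, 4, 3) are
# kept and checkpoint_2, checkpoint_1, checkpoint_0 are removed.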
def _write_success(dirname):
"""
Write a file named "_SUCCESS" (containing the current time) in the checkpoint dir to indicate that this checkpoint is complete.
: param dirname
"""
success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
with open(success_file, 'a') as f:
now = time.ctime()
f.write(now)
def _get_latest_checkpoint_serial(checkpoint_dir):
"""
Get the serial number of the latest valid checkpoint in the checkpoint directory; a checkpoint is valid only if its _SUCCESS file exists.
: param checkpoint_dir
"""
if not checkpoint_dir:
return -1
def has_success(checkpoint_dir, cur_dir):
"""
check whether _SUCCESS exists in this dir
"""
serial = _get_dir_serial(cur_dir)
if serial == -1 or not os.path.isdir(
os.path.join(checkpoint_dir, cur_dir)):
return -1
success_path = os.path.join(
_get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
SUCCESS_MARK_FILENAME)
if os.path.isfile(success_path):
return serial
if not os.path.isdir(checkpoint_dir):
return -1
current_dir = -1
dirs = os.listdir(checkpoint_dir)
for cur_dir in dirs:
success_num = has_success(checkpoint_dir, cur_dir)
if success_num > current_dir:
current_dir = success_num
return current_dir
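# Illustrative on-disk layout assumed by the helpers above (directory
# names follow the constants defined earlier; serials are parsed from
# the suffix):
#
#   checkpoint_dir/
#       checkpoint_0/
#           __model__/_SUCCESS    # written via _write_success
#           trainer_0/epoch_id    # written by _save_trainer_args
#       checkpoint_1/
#           ...
#
# _get_latest_checkpoint_serial returns 1 here, provided
# checkpoint_1/__model__/_SUCCESS exists.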
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
from inference_transpiler import InferenceTranspiler from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory from memory_optimization_transpiler import memory_optimize, release_memory
from ps_dispatcher import HashName, RoundRobin from ps_dispatcher import HashName, RoundRobin
__all__ = [ __all__ = [
"DistributeTranspiler", "InferenceTranspiler", "memory_optimize", "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
"release_memory", "HashName", "RoundRobin" "release_memory", "HashName", "RoundRobin", "DistributeTranspilerConfig"
] ]
...@@ -17,10 +17,10 @@ def delete_ops(block, ops): ...@@ -17,10 +17,10 @@ def delete_ops(block, ops):
try: try:
start = list(block.ops).index(ops[0]) start = list(block.ops).index(ops[0])
end = list(block.ops).index(ops[-1]) end = list(block.ops).index(ops[-1])
[block.remove_op(start) for _ in xrange(end - start + 1)] [block._remove_op(start) for _ in xrange(end - start + 1)]
except Exception, e: except Exception, e:
raise e raise e
block.program.sync_with_cpp() block.program._sync_with_cpp()
def find_op_by_input_arg(block, arg_name): def find_op_by_input_arg(block, arg_name):
......
...@@ -31,6 +31,7 @@ Steps to transpile pserver: ...@@ -31,6 +31,7 @@ Steps to transpile pserver:
from __future__ import print_function from __future__ import print_function
import math import math
import random
import numpy as np import numpy as np
from ps_dispatcher import RoundRobin, HashName, PSDispatcher from ps_dispatcher import RoundRobin, HashName, PSDispatcher
...@@ -63,7 +64,7 @@ def same_or_split_var(p_name, var_name): ...@@ -63,7 +64,7 @@ def same_or_split_var(p_name, var_name):
return p_name == var_name or p_name.startswith(var_name + ".block") return p_name == var_name or p_name.startswith(var_name + ".block")
def slice_variable(var_list, slice_count, min_block_size=8192): def slice_variable(var_list, slice_count, min_block_size):
""" """
We may need to split dense tensor to one or more blocks and put We may need to split dense tensor to one or more blocks and put
them equally onto parameter server. One block is a sub-tensor them equally onto parameter server. One block is a sub-tensor
...@@ -109,6 +110,22 @@ def slice_variable(var_list, slice_count, min_block_size=8192): ...@@ -109,6 +110,22 @@ def slice_variable(var_list, slice_count, min_block_size=8192):
return blocks return blocks
class DistributeTranspilerConfig(object):
"""
slice_var_up (bool): Whether to slice up Tensors for pservers; default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used;
try to choose the method that best balances loads across pservers.
min_block_size (int): Minimum number of elements in a splitted block.
According to https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156,
bandwidth is used efficiently when the data size is larger than 2MB. If you
want to change it, please make sure you understand the slice_variable function.
"""
slice_var_up = True
split_method = None
min_block_size = 8192
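# A minimal usage sketch (added for illustration; the endpoint string and
# counts are placeholders, not part of the original change):
#
#     config = DistributeTranspilerConfig()
#     config.slice_var_up = False   # keep each var whole on one pserver
#     t = DistributeTranspiler(config=config)
#     t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=1)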
class DistributeTranspiler(object): class DistributeTranspiler(object):
""" """
**DistributeTranspiler** **DistributeTranspiler**
...@@ -145,13 +162,23 @@ class DistributeTranspiler(object): ...@@ -145,13 +162,23 @@ class DistributeTranspiler(object):
trainer_program = t.get_trainer_program() trainer_program = t.get_trainer_program()
""" """
def __init__(self, config=None):
if config is not None:
self.config = config
else:
self.config = DistributeTranspilerConfig()
if self.config.split_method is None:
self.config.split_method = RoundRobin
assert (self.config.min_block_size >= 8192)
assert (self.config.split_method.__bases__[0] == PSDispatcher)
def transpile(self, def transpile(self,
trainer_id, trainer_id,
program=None, program=None,
pservers="127.0.0.1:6174", pservers="127.0.0.1:6174",
trainers=1, trainers=1,
slice_var_up=True,
split_method=RoundRobin,
sync_mode=True): sync_mode=True):
""" """
Run the transpiler. Run the transpiler.
...@@ -164,12 +191,8 @@ class DistributeTranspiler(object): ...@@ -164,12 +191,8 @@ class DistributeTranspiler(object):
pservers (str): comma separated ip:port string for the pserver pservers (str): comma separated ip:port string for the pserver
list. list.
trainers (int): number of trainers in the distributed job. trainers (int): number of trainers in the distributed job.
slice_var_up (bool): Do Tensor slice for pservers, default is True.
split_method (PSDispatcher): RoundRobin or HashName can be used
try to choose the best method to balance loads for pservers.
sync_mode (bool): Do sync training or not, default is True. sync_mode (bool): Do sync training or not, default is True.
""" """
assert (split_method.__bases__[0] == PSDispatcher)
if program is None: if program is None:
program = default_main_program() program = default_main_program()
self.origin_program = program self.origin_program = program
...@@ -180,11 +203,11 @@ class DistributeTranspiler(object): ...@@ -180,11 +203,11 @@ class DistributeTranspiler(object):
self.pserver_endpoints = pserver_endpoints self.pserver_endpoints = pserver_endpoints
self.optimize_ops, self.params_grads = self._get_optimize_pass() self.optimize_ops, self.params_grads = self._get_optimize_pass()
ps_dispatcher = split_method(self.pserver_endpoints) ps_dispatcher = self.config.split_method(self.pserver_endpoints)
self.has_distributed_lookup_table = self._has_distributed_lookup_table() self.has_distributed_lookup_table = self._has_distributed_lookup_table()
# split and create vars, then put splited vars in dicts for later use. # split and create vars, then put splited vars in dicts for later use.
self._init_splited_vars(slice_var_up) self._init_splited_vars()
# step 3.1: insert send op to send gradient vars to parameter servers # step 3.1: insert send op to send gradient vars to parameter servers
ps_dispatcher.reset() ps_dispatcher.reset()
...@@ -196,13 +219,14 @@ class DistributeTranspiler(object): ...@@ -196,13 +219,14 @@ class DistributeTranspiler(object):
# fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2 # fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
# shuffle the map will avoid the uneven distribution above # shuffle the map will avoid the uneven distribution above
grad_var_mapping_items = self.grad_var_mapping.items() grad_var_mapping_items = self.grad_var_mapping.items()
if not slice_var_up: if not self.config.slice_var_up:
np.random.shuffle(grad_var_mapping_items) random.seed(self.trainer_num)
random.shuffle(grad_var_mapping_items)
for orig_varname, splited_vars in grad_var_mapping_items: for orig_varname, splited_vars in grad_var_mapping_items:
eplist = ps_dispatcher.dispatch(splited_vars) eplist = ps_dispatcher.dispatch(splited_vars)
if not slice_var_up: if not self.config.slice_var_up:
assert (len(splited_vars) == 1) assert (len(splited_vars) == 1)
if len(splited_vars) == 1: if len(splited_vars) == 1:
...@@ -219,7 +243,7 @@ class DistributeTranspiler(object): ...@@ -219,7 +243,7 @@ class DistributeTranspiler(object):
AssertionError("Can not insert the send op by original " AssertionError("Can not insert the send op by original "
"variable name :", orig_varname) "variable name :", orig_varname)
program.global_block().insert_op( program.global_block()._insert_op(
index=index + 1, index=index + 1,
type="send", type="send",
inputs={"X": splited_vars}, inputs={"X": splited_vars},
...@@ -377,11 +401,6 @@ class DistributeTranspiler(object): ...@@ -377,11 +401,6 @@ class DistributeTranspiler(object):
# append it into the sub program. # append it into the sub program.
global_ops = [] global_ops = []
# HACK: optimization global ops only used to scale beta1 and beta2
# replace it with dependency engine.
for op in self.optimize_ops:
if self._is_adam_connected_op(op):
global_ops.append(op)
def __append_optimize_op__(op, block, grad_to_block_id, merged_var, def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
lr_ops): lr_ops):
...@@ -410,7 +429,7 @@ class DistributeTranspiler(object): ...@@ -410,7 +429,7 @@ class DistributeTranspiler(object):
# clone vars # clone vars
for var in origin_block.vars: for var in origin_block.vars:
new_sub_block.clone_variable(var) new_sub_block._clone_variable(var)
# clone ops # clone ops
for origin_op in origin_block.ops: for origin_op in origin_block.ops:
...@@ -442,6 +461,8 @@ class DistributeTranspiler(object): ...@@ -442,6 +461,8 @@ class DistributeTranspiler(object):
per_opt_block = pserver_program.create_block(pre_block_idx) per_opt_block = pserver_program.create_block(pre_block_idx)
optimize_blocks.append(per_opt_block) optimize_blocks.append(per_opt_block)
# append grad merging ops before clip and weight decay # append grad merging ops before clip and weight decay
# cases may like:
# L2Decay op -> clip op -> optimize
for _, op in enumerate(self.optimize_ops): for _, op in enumerate(self.optimize_ops):
# find the origin @GRAD var before clipping # find the origin @GRAD var before clipping
grad_varname_for_block = __op_have_grad_input__(op) grad_varname_for_block = __op_have_grad_input__(op)
...@@ -449,6 +470,7 @@ class DistributeTranspiler(object): ...@@ -449,6 +470,7 @@ class DistributeTranspiler(object):
merged_var = self._append_pserver_grad_merge_ops( merged_var = self._append_pserver_grad_merge_ops(
per_opt_block, grad_varname_for_block, endpoint, per_opt_block, grad_varname_for_block, endpoint,
grad_to_block_id, self.origin_program) grad_to_block_id, self.origin_program)
break # append optimize op once then append other ops.
for _, op in enumerate(self.optimize_ops): for _, op in enumerate(self.optimize_ops):
# optimizer is connected to itself # optimizer is connected to itself
if ufind.is_connected(op, opt_op) and op not in global_ops: if ufind.is_connected(op, opt_op) and op not in global_ops:
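The grad-merge step above collects the per-trainer copies of a split gradient before the optimizer block runs. A hedged numpy sketch of sync-mode merging, assuming the sum-then-average convention (an illustration, not the literal `_append_pserver_grad_merge_ops` code):

    import numpy as np

    # Hypothetical per-trainer copies of one gradient block, as a
    # pserver would receive them in sync mode.
    grads_from_trainers = [np.ones((4, 2)) * t for t in range(3)]

    # Merge: sum the copies, then scale by 1/trainer_num so the update
    # matches single-machine training.
    merged = sum(grads_from_trainers) / len(grads_from_trainers)
    print(merged)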
...@@ -503,7 +525,7 @@ class DistributeTranspiler(object): ...@@ -503,7 +525,7 @@ class DistributeTranspiler(object):
outputs={}, outputs={},
attrs=attrs) attrs=attrs)
pserver_program.sync_with_cpp() pserver_program._sync_with_cpp()
return pserver_program return pserver_program
def get_startup_program(self, endpoint, pserver_program): def get_startup_program(self, endpoint, pserver_program):
...@@ -535,7 +557,7 @@ class DistributeTranspiler(object): ...@@ -535,7 +557,7 @@ class DistributeTranspiler(object):
pserver_vars = pserver_program.global_block().vars pserver_vars = pserver_program.global_block().vars
created_var_map = dict() created_var_map = dict()
for _, var in pserver_vars.iteritems(): for _, var in pserver_vars.iteritems():
tmpvar = s_prog.global_block().clone_variable(var) tmpvar = s_prog.global_block()._clone_variable(var)
created_var_map[var.name] = tmpvar created_var_map[var.name] = tmpvar
# 2. rename op outputs # 2. rename op outputs
...@@ -630,7 +652,7 @@ class DistributeTranspiler(object): ...@@ -630,7 +652,7 @@ class DistributeTranspiler(object):
] ]
return param_list, grad_list return param_list, grad_list
def _init_splited_vars(self, slice_var_up): def _init_splited_vars(self):
# update these mappings for further transpile: # update these mappings for further transpile:
# 1. param_var_mapping: param var name -> [splited params vars] # 1. param_var_mapping: param var name -> [splited params vars]
# 2. grad_var_mapping: grad var name -> [splited grads vars] # 2. grad_var_mapping: grad var name -> [splited grads vars]
...@@ -654,17 +676,22 @@ class DistributeTranspiler(object): ...@@ -654,17 +676,22 @@ class DistributeTranspiler(object):
param_list, grad_list = self._update_dist_lookup_table_vars( param_list, grad_list = self._update_dist_lookup_table_vars(
param_list, grad_list, self.params_grads) param_list, grad_list, self.params_grads)
if slice_var_up: if self.config.slice_var_up:
# when we slice var up into blocks, we will slice the var according to # when we slice var up into blocks, we will slice the var according to
# pserver services' count. A pserver may have two or more listening ports. # pserver services' count. A pserver may have two or more listening ports.
grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) grad_blocks = slice_variable(grad_list,
len(self.pserver_endpoints),
self.config.min_block_size)
param_blocks = slice_variable(param_list, param_blocks = slice_variable(param_list,
len(self.pserver_endpoints)) len(self.pserver_endpoints),
self.config.min_block_size)
else: else:
# when we do NOT slice var up into blocks, we will always slice params # when we do NOT slice var up into blocks, we will always slice params
# grads into one block. # grads into one block.
grad_blocks = slice_variable(grad_list, 1) grad_blocks = slice_variable(grad_list, 1,
param_blocks = slice_variable(param_list, 1) self.config.min_block_size)
param_blocks = slice_variable(param_list, 1,
self.config.min_block_size)
assert (len(grad_blocks) == len(param_blocks)) assert (len(grad_blocks) == len(param_blocks))
# origin_varname -> [splited_var] # origin_varname -> [splited_var]
...@@ -733,7 +760,7 @@ class DistributeTranspiler(object): ...@@ -733,7 +760,7 @@ class DistributeTranspiler(object):
self.all_prefetch_output_vars.append(prefetch_output_vars) self.all_prefetch_output_vars.append(prefetch_output_vars)
# insert split_ids_op # insert split_ids_op
program.global_block().insert_op( program.global_block()._insert_op(
index=lookup_table_op_index, index=lookup_table_op_index,
type="split_ids", type="split_ids",
inputs={ inputs={
...@@ -745,7 +772,7 @@ class DistributeTranspiler(object): ...@@ -745,7 +772,7 @@ class DistributeTranspiler(object):
outputs={"Out": prefetch_input_vars}) outputs={"Out": prefetch_input_vars})
# insert prefetch_op # insert prefetch_op
program.global_block().insert_op( program.global_block()._insert_op(
index=lookup_table_op_index + 1, index=lookup_table_op_index + 1,
type="prefetch", type="prefetch",
inputs={'X': prefetch_input_vars}, inputs={'X': prefetch_input_vars},
...@@ -756,7 +783,7 @@ class DistributeTranspiler(object): ...@@ -756,7 +783,7 @@ class DistributeTranspiler(object):
}) })
# insert concat_op # insert concat_op
program.global_block().insert_op( program.global_block()._insert_op(
index=lookup_table_op_index + 2, index=lookup_table_op_index + 2,
type="merge_ids", type="merge_ids",
inputs={ inputs={
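The three `_insert_op` calls above form the trainer-side prefetch pipeline for a distributed lookup table: `split_ids` shards the lookup ids across pservers, `prefetch` fetches each shard's embedding rows remotely, and `merge_ids` restores the rows to the original id order. A plain-Python simulation of the split/merge semantics (sharding by `id % num_pservers` is an assumption for illustration):

    def split_ids(ids, num_pservers):
        # Shard ids across pservers; modulo hashing, for illustration.
        shards = [[] for _ in range(num_pservers)]
        for i in ids:
            shards[i % num_pservers].append(i)
        return shards

    def merge_ids(ids, shards, rows_per_shard):
        # Reassemble the fetched rows in the original id order.
        lookup = {}
        for shard, rows in zip(shards, rows_per_shard):
            lookup.update(zip(shard, rows))
        return [lookup[i] for i in ids]

    ids = [7, 2, 9, 4]
    shards = split_ids(ids, num_pservers=2)            # [[2, 4], [7, 9]]
    rows = [["row%d" % i for i in s] for s in shards]  # fake prefetch
    print(merge_ids(ids, shards, rows))                # rows in id order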
...@@ -787,14 +814,14 @@ class DistributeTranspiler(object): ...@@ -787,14 +814,14 @@ class DistributeTranspiler(object):
if table_grad_name in op.output_arg_names: if table_grad_name in op.output_arg_names:
op_index = list(all_ops).index(op) op_index = list(all_ops).index(op)
# insert split_ids_op # insert split_ids_op
program.global_block().insert_op( program.global_block()._insert_op(
index=op_index + 1, index=op_index + 1,
type="split_ids", type="split_ids",
inputs={ inputs={
'Ids': [program.global_block().vars[table_grad_name]] 'Ids': [program.global_block().vars[table_grad_name]]
}, },
outputs={"Out": self.trainer_side_table_grad_list}) outputs={"Out": self.trainer_side_table_grad_list})
program.global_block().insert_op( program.global_block()._insert_op(
index=op_index + 2, index=op_index + 2,
type="send", type="send",
inputs={'X': self.trainer_side_table_grad_list}, inputs={'X': self.trainer_side_table_grad_list},
...@@ -853,7 +880,7 @@ class DistributeTranspiler(object): ...@@ -853,7 +880,7 @@ class DistributeTranspiler(object):
persistable=True) persistable=True)
# parameter must be selected rows # parameter must be selected rows
param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
grad_var = pserver_program.global_block().clone_variable( grad_var = pserver_program.global_block()._clone_variable(
self.origin_program.global_block().vars[grad_var_name( self.origin_program.global_block().vars[grad_var_name(
self.table_name)]) self.table_name)])
...@@ -893,7 +920,7 @@ class DistributeTranspiler(object): ...@@ -893,7 +920,7 @@ class DistributeTranspiler(object):
if not splited_grad_name.startswith(origin_grad_name): if not splited_grad_name.startswith(origin_grad_name):
raise ValueError("origin_grad_var: " + splited_grad_name + raise ValueError("origin_grad_var: " + splited_grad_name +
" grad_var:" + grad_var.name) " grad_var:" + grad_var.name)
grad_var = pserver_program.global_block().rename_var( grad_var = pserver_program.global_block()._rename_var(
origin_grad_name, splited_grad_name) origin_grad_name, splited_grad_name)
lr_var = pserver_program.global_block().vars[table_opt_op.input( lr_var = pserver_program.global_block().vars[table_opt_op.input(
...@@ -969,7 +996,7 @@ class DistributeTranspiler(object): ...@@ -969,7 +996,7 @@ class DistributeTranspiler(object):
if self.sync_mode and add_trainer_suffix: if self.sync_mode and add_trainer_suffix:
new_var_name = "%s.trainer_%d" % \ new_var_name = "%s.trainer_%d" % \
(orig_var.name, self.trainer_id) (orig_var.name, self.trainer_id)
program.global_block().rename_var(varname, new_var_name) program.global_block()._rename_var(varname, new_var_name)
var_mapping[varname] = \ var_mapping[varname] = \
[program.global_block().var(new_var_name)] [program.global_block().var(new_var_name)]
else: else:
...@@ -1003,7 +1030,7 @@ class DistributeTranspiler(object): ...@@ -1003,7 +1030,7 @@ class DistributeTranspiler(object):
type=orig_var.type, type=orig_var.type,
shape=splited_shape) # flattened splited var shape=splited_shape) # flattened splited var
var_mapping[varname].append(var) var_mapping[varname].append(var)
program.global_block().sync_with_cpp() program.global_block()._sync_with_cpp()
return var_mapping return var_mapping
def create_splited_vars(self, source_var, block, tag): def create_splited_vars(self, source_var, block, tag):
...@@ -1031,7 +1058,7 @@ class DistributeTranspiler(object): ...@@ -1031,7 +1058,7 @@ class DistributeTranspiler(object):
height_sections = [] height_sections = []
for v in splited_vars: for v in splited_vars:
height_sections.append(v.shape[0]) height_sections.append(v.shape[0])
program.global_block().insert_op( program.global_block()._insert_op(
index=index + 1, index=index + 1,
type="split_selected_rows", type="split_selected_rows",
inputs={"X": orig_var}, inputs={"X": orig_var},
...@@ -1041,7 +1068,7 @@ class DistributeTranspiler(object): ...@@ -1041,7 +1068,7 @@ class DistributeTranspiler(object):
sections = [] sections = []
for v in splited_vars: for v in splited_vars:
sections.append(v.shape[0]) sections.append(v.shape[0])
program.global_block().insert_op( program.global_block()._insert_op(
index=index + 1, index=index + 1,
type="split_byref", type="split_byref",
inputs={"X": orig_var}, inputs={"X": orig_var},
...@@ -1230,7 +1257,7 @@ class DistributeTranspiler(object): ...@@ -1230,7 +1257,7 @@ class DistributeTranspiler(object):
varlist = [varlist] varlist = [varlist]
for var in varlist: for var in varlist:
if var not in program.global_block().vars: if var not in program.global_block().vars:
block.clone_variable(var) block._clone_variable(var)
outputs = self._get_output_map_from_op( outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, op) self.origin_program.global_block().vars, op)
...@@ -1239,7 +1266,7 @@ class DistributeTranspiler(object): ...@@ -1239,7 +1266,7 @@ class DistributeTranspiler(object):
varlist = [varlist] varlist = [varlist]
for var in varlist: for var in varlist:
if var not in program.global_block().vars: if var not in program.global_block().vars:
block.clone_variable(var) block._clone_variable(var)
return block.append_op( return block.append_op(
type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
...@@ -1277,7 +1304,7 @@ class DistributeTranspiler(object): ...@@ -1277,7 +1304,7 @@ class DistributeTranspiler(object):
if grad_block: if grad_block:
outputs[key] = grad_block outputs[key] = grad_block
elif not program.global_block().vars.has_key(var.name): elif not program.global_block().vars.has_key(var.name):
program.global_block().clone_variable(var) program.global_block()._clone_variable(var)
return optimize_block.append_op( return optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
...@@ -1289,26 +1316,8 @@ class DistributeTranspiler(object): ...@@ -1289,26 +1316,8 @@ class DistributeTranspiler(object):
# If one op's input is another op's output or # If one op's input is another op's output or
# one op's output is another op's input, we say # one op's output is another op's input, we say
# the two operator is connected. # the two operator is connected.
def _append_inname_remove_beta(varname_list): if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
op_input_names = [] set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
for in_name in varname_list:
# HACK: remove beta1 and beta2 to avoid let all
# ops connected.
if in_name.startswith("beta2_pow_acc") or \
in_name.startswith("beta1_pow_acc"):
continue
else:
op_input_names.append(in_name)
return op_input_names
op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
op1_output_names = op1.desc.output_arg_names()
op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
op2_output_names = op2.desc.output_arg_names()
if set(op1_output_names) & set(op2_input_names) or \
set(op1_input_names) & set(op2_output_names):
return True return True
return False return False
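The rewritten connectivity test treats two ops as connected when either one consumes what the other produces; removing the beta1/beta2 filtering hack is what the deleted `_is_adam_connected_op` helper below compensated for. A standalone sketch of the same set-intersection test on hypothetical op descriptions:

    def is_connected(op1_inputs, op1_outputs, op2_inputs, op2_outputs):
        # Connected if either op consumes what the other produces.
        return bool(set(op1_outputs) & set(op2_inputs) or
                    set(op1_inputs) & set(op2_outputs))

    # sgd reads fc_w@GRAD, which the backward op produces -> connected.
    print(is_connected(["fc_w", "fc_w@GRAD"], ["fc_w"],
                       ["out@GRAD"], ["fc_w@GRAD"]))  # True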
...@@ -1413,7 +1422,7 @@ class DistributeTranspiler(object): ...@@ -1413,7 +1422,7 @@ class DistributeTranspiler(object):
def _get_optimize_pass(self): def _get_optimize_pass(self):
""" """
Get optimizer operators, paramters and gradients from origin_program Get optimizer operators, parameters and gradients from origin_program
Returns: Returns:
opt_ops (list): optimize operators. opt_ops (list): optimize operators.
params_grads (dict): parameter->gradient. params_grads (dict): parameter->gradient.
...@@ -1436,20 +1445,6 @@ class DistributeTranspiler(object): ...@@ -1436,20 +1445,6 @@ class DistributeTranspiler(object):
origin_var_dict[param_name], origin_var_dict[param_name],
origin_var_dict[input_name] origin_var_dict[input_name]
]) ])
elif self._is_adam_connected_op(op):
opt_ops.append(op)
else: else:
pass pass
return opt_ops, params_grads return opt_ops, params_grads
def _is_adam_connected_op(self, op):
"""
A hack function to determine whether the input operator
is connected to the optimize operator.
"""
if op.type == "scale":
for in_name in op.input_arg_names:
if in_name.startswith("beta1_pow_acc") or \
in_name.startswith("beta2_pow_acc"):
return True
return False
...@@ -95,7 +95,7 @@ class InferenceTranspiler(object): ...@@ -95,7 +95,7 @@ class InferenceTranspiler(object):
# modify bnorm OP to include relu # modify bnorm OP to include relu
current_op.set_attr("fuse_with_relu", True) current_op.set_attr("fuse_with_relu", True)
# remove relu OP # remove relu OP
self.block.remove_op(i + 1) self.block._remove_op(i + 1)
i = i + 1 i = i + 1
self._remove_unused_var() self._remove_unused_var()
...@@ -171,7 +171,7 @@ class InferenceTranspiler(object): ...@@ -171,7 +171,7 @@ class InferenceTranspiler(object):
# fuse batch_norm # fuse batch_norm
self._fuse_param(current_op, next_op, bias_op, 0) self._fuse_param(current_op, next_op, bias_op, 0)
# remove batch_norm_op # remove batch_norm_op
self.block.remove_op(i + 2) self.block._remove_op(i + 2)
i = i + 1 i = i + 1
# conv2d with bias, the next_op.type is elementwise_add # conv2d with bias, the next_op.type is elementwise_add
elif (next_op.type == 'elementwise_add'): elif (next_op.type == 'elementwise_add'):
...@@ -180,7 +180,7 @@ class InferenceTranspiler(object): ...@@ -180,7 +180,7 @@ class InferenceTranspiler(object):
# fuse batch_norm # fuse batch_norm
self._fuse_param(current_op, next_next_op, next_op, 1) self._fuse_param(current_op, next_next_op, next_op, 1)
# remove batch_norm_op # remove batch_norm_op
self.block.remove_op(i + 2) self.block._remove_op(i + 2)
i = i + 1 i = i + 1
i = i + 1 i = i + 1
...@@ -212,7 +212,7 @@ class InferenceTranspiler(object): ...@@ -212,7 +212,7 @@ class InferenceTranspiler(object):
y_var = self.block.var(bn_op.input("Bias")[0]) y_var = self.block.var(bn_op.input("Bias")[0])
out_var = self.block.var(bn_op.output("Y")[0]) out_var = self.block.var(bn_op.output("Y")[0])
bias_op = self.block.insert_op( bias_op = self.block._insert_op(
index, index,
type="elementwise_add", type="elementwise_add",
inputs={"X": x_var, inputs={"X": x_var,
...@@ -307,4 +307,4 @@ class InferenceTranspiler(object): ...@@ -307,4 +307,4 @@ class InferenceTranspiler(object):
for var in self.block.vars.keys(): for var in self.block.vars.keys():
if var not in args: if var not in args:
self.block.remove_var(var) self.block._remove_var(var)
...@@ -177,7 +177,7 @@ class ControlFlowGraph(object): ...@@ -177,7 +177,7 @@ class ControlFlowGraph(object):
in_diff) in_diff)
if can_optimize: if can_optimize:
index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1 index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
delete_op = block_desc.insert_op(index) delete_op = block_desc._insert_op(index)
delete_op.set_type("delete_var") delete_op.set_type("delete_var")
delete_op.set_input("X", can_optimize) delete_op.set_input("X", can_optimize)
if is_forward: if is_forward:
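The memory pass above inserts a `delete_var` op right after the last op that still needs a variable. Computing that last-use index is the heart of the transform; a small sketch over a toy op list (the tuple structure is assumed for illustration):

    def last_use_indices(ops):
        # ops: list of (name, inputs, outputs); returns var -> index of
        # the last op touching it, so delete_var goes at index + 1.
        last_use = {}
        for idx, (_, inputs, outputs) in enumerate(ops):
            for var in list(inputs) + list(outputs):
                last_use[var] = idx
        return last_use

    ops = [("mul", ["x", "w"], ["tmp"]),
           ("relu", ["tmp"], ["y"]),
           ("scale", ["y"], ["out"])]
    print(last_use_indices(ops))  # tmp dies after op 1, y after op 2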
......
...@@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator: ...@@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator:
TODO(yuyang18): Should we add whole design doc here? TODO(yuyang18): Should we add whole design doc here?
""" """
import decorator import paddle.reader.decorator
from decorator import * from paddle.reader.decorator import *
import creator import paddle.reader.creator
__all__ = decorator.__all__ + ['creator'] __all__ = decorator.__all__ + ['creator']
...@@ -20,7 +20,7 @@ __all__ = [ ...@@ -20,7 +20,7 @@ __all__ = [
from threading import Thread from threading import Thread
import subprocess import subprocess
from Queue import Queue from six.moves.queue import Queue
import itertools import itertools
import random import random
import zlib import zlib
......
...@@ -182,7 +182,7 @@ def resize_short(im, size): ...@@ -182,7 +182,7 @@ def resize_short(im, size):
h_new = size * h / w h_new = size * h / w
else: else:
w_new = size * w / h w_new = size * w / h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
return im return im
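The swap above is a real bug fix: `cv2.resize` expects its target size as `(width, height)`, while numpy image shapes read `(height, width, channels)`, so the two values must be reversed when moving between them. A quick demonstration:

    import cv2
    import numpy as np

    im = np.zeros((100, 200, 3), dtype=np.uint8)  # h=100, w=200
    resized = cv2.resize(im, (50, 25))            # dsize is (width, height)
    print(resized.shape)                          # (25, 50, 3): h=25, w=50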
...@@ -324,7 +324,6 @@ def simple_transform(im, ...@@ -324,7 +324,6 @@ def simple_transform(im,
if np.random.randint(2) == 0: if np.random.randint(2) == 0:
im = left_right_flip(im, is_color) im = left_right_flip(im, is_color)
else: else:
im = center_crop(im, crop_size, is_color)
im = center_crop(im, crop_size, is_color=is_color) im = center_crop(im, crop_size, is_color=is_color)
if len(im.shape) == 3: if len(im.shape) == 3:
im = to_chw(im) im = to_chw(im)
......
...@@ -8,4 +8,4 @@ scipy>=0.19.0 ...@@ -8,4 +8,4 @@ scipy>=0.19.0
Pillow Pillow
nltk>=3.2.2 nltk>=3.2.2
graphviz graphviz
LinkChecker six
...@@ -17,7 +17,8 @@ def git_commit(): ...@@ -17,7 +17,8 @@ def git_commit():
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
except: except:
git_commit = 'Unknown' git_commit = 'Unknown'
return git_commit git_commit = git_commit.decode()
return str(git_commit)
def _get_version_detail(idx): def _get_version_detail(idx):
assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
...@@ -42,12 +43,13 @@ def get_patch(): ...@@ -42,12 +43,13 @@ def get_patch():
def is_taged(): def is_taged():
try: try:
cmd = ['git', 'describe', '--exact-match', '--tags'] cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
git_tag = git_tag.decode()
except: except:
return False return False
if git_tag.replace('v', '') == '@PADDLE_VERSION@': if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
return True return True
else: else:
return False return False
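One caveat on the `is_taged` change: items in a `Popen` argument list are passed to `git` verbatim, so `'2>/dev/null'` arrives as a literal argument rather than a shell redirection. A hedged alternative that suppresses stderr explicitly (a sketch, not what the patch does):

    import subprocess

    cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)  # swallow error text
    git_tag = proc.communicate()[0].strip().decode()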
...@@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s' ...@@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s'
def show(): def show():
if istaged: if istaged:
print 'full_version:', full_version print('full_version:', full_version)
print 'major:', major print('major:', major)
print 'minor:', minor print('minor:', minor)
print 'patch:', patch print('patch:', patch)
print 'rc:', rc print('rc:', rc)
else: else:
print 'commit:', commit print('commit:', commit)
def mkl(): def mkl():
return with_mkl return with_mkl
...@@ -181,6 +183,14 @@ else: ...@@ -181,6 +183,14 @@ else:
command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
if os.system(command) != 0: if os.system(command) != 0:
raise Exception("patch core.so failed, command: %s" % command) raise Exception("patch core.so failed, command: %s" % command)
if '${WITH_FLUID_ONLY}'== 'OFF':
# change rpath of _swig_paddle.so.
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
else:
command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
if os.system(command) != 0:
raise Exception("patch _swig_paddle.so failed, command: %s" % command)
setup(name='${PACKAGE_NAME}', setup(name='${PACKAGE_NAME}',
version='${PADDLE_VERSION}', version='${PADDLE_VERSION}',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import json
def check_approval(count, required_reviewers):
json_buff = ""
for line in sys.stdin:
json_buff = "".join([json_buff, line])
json_resp = json.loads(json_buff)
approves = 0
approved_user_ids = []
for review in json_resp:
if review["state"] == "APPROVED":
approves += 1
approved_user_ids.append(review["user"]["id"])
# convert to int
required_reviewers_int = set()
for rr in required_reviewers:
required_reviewers_int.add(int(rr))
if len(set(approved_user_ids) & required_reviewers_int) >= count:
print("TRUE")
else:
print("FALSE")
if __name__ == "__main__":
if len(sys.argv) > 1 and sys.argv[1].isdigit():
check_approval(int(sys.argv[1]), sys.argv[2:])
else:
print(
"Usage: python check_pr_approval.py [count] [required reviewer id] ..."
)
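The script reads a GitHub reviews API payload from stdin and prints TRUE once at least `count` of the required reviewer ids have an APPROVED review. A hypothetical invocation (the file name, reviewer id, and payload shape are placeholders inferred from the fields the script reads):

    import json
    import subprocess

    reviews = json.dumps([
        {"state": "APPROVED", "user": {"id": 101}},
        {"state": "CHANGES_REQUESTED", "user": {"id": 202}},
    ])
    proc = subprocess.Popen(
        ["python", "check_pr_approval.py", "1", "101"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        universal_newlines=True)
    out, _ = proc.communicate(reviews)
    print(out.strip())  # TRUE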