Commit 9386ac0a authored by Yibing Liu

Enhance cuda code & unittest for argsort_op

@@ -4,7 +4,6 @@
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......
@@ -122,5 +122,9 @@ def parse_args():
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, uses inference transpiler to optimize the program.')
args = parser.parse_args()
return args
@@ -131,6 +131,11 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe = fluid.Executor(place)
exe.run(startup_prog)
# Use inference_transpiler to speedup
if args.use_inference_transpiler:
t = fluid.InferenceTranspiler()
t.transpile(infer_prog, place)
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
......
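The new `--use_inference_transpiler` flag added in the first hunk is consumed in `train()` in the hunk above: when set, the inference program is rewritten by `fluid.InferenceTranspiler` before it is evaluated. A minimal sketch of that call sequence (the helper name is hypothetical; `infer_prog` and `place` stand for the inference `Program` and device place the benchmark script already creates)::

    import paddle.fluid as fluid

    def maybe_transpile_for_inference(infer_prog, place, use_inference_transpiler):
        """Sketch of the logic added to train(): optionally optimize infer_prog in place."""
        if use_inference_transpiler:
            t = fluid.InferenceTranspiler()
            t.transpile(infer_prog, place)
        return infer_prog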
@@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
endforeach()
endfunction()
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
    # download library
    message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
    execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
endif()
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
......
@@ -40,12 +40,12 @@ ExternalProject_Add(
# NOTE(wuyi):
# this package is generated by following steps:
# 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
# 2. git submodule update --init
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
@@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
TARGET_LINK_LIBRARIES(cblas dynload_mklml)
ELSE()
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
......
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
# for building inference libs
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
@@ -195,6 +209,15 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Only add a dependency on libmklml.so; do not link against it directly
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
list(APPEND cc_library_DEPS dynload_mklml)
endif()
add_dependencies(${TARGET_NAME} mklml)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
@@ -241,6 +264,7 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(cc_test)
@@ -305,6 +329,7 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
endif()
endif()
endfunction(nv_test)
@@ -552,7 +577,7 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......
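The `FLAGS_init_allocated_mem=true` environment added to `cc_test`, `nv_test`, and `py_test` makes Fluid's allocator fill freshly allocated buffers with non-zero values, so tests that silently rely on zero-initialized memory fail instead of passing by luck. A sketch of reproducing the same environment when running a single Python test by hand (paths and the test file name are placeholders)::

    import os
    import subprocess
    import sys

    # Mirror what the updated py_test() helper does: inject the flag through
    # the environment before launching the test process.
    env = dict(os.environ)
    env["FLAGS_init_allocated_mem"] = "true"
    env["PYTHONPATH"] = "/path/to/Paddle/build/python"  # placeholder build dir

    subprocess.check_call(
        [sys.executable, "-u", "test_some_op.py"],  # placeholder test script
        env=env)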
@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
@@ -149,21 +136,33 @@ copy(memory_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(module "inference") set(inference_deps paddle_fluid_shared paddle_fluid)
copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
if(WITH_CONTRIB) if(WITH_CONTRIB)
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") message(STATUS "installing contrib")
copy(contrib_inference_lib DEPS paddle_inference_api set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir} DSTS ${contrib_dst_dir} ${contrib_dst_dir})
) list(APPEND inference_deps contrib_inference_lib)
endif() endif()
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
......
=========
About Us
=========

What is PaddlePaddle
--------------------

- PaddlePaddle is a deep learning framework independently developed and open-sourced by Baidu. It lets developers and enterprises realize their AI ideas safely and quickly.
- The project team brings together top deep learning scientists from around the world, dedicated to giving developers and enterprises the best deep learning development experience.
- The framework is easy to learn, easy to use, safe, and efficient, making it the deep learning tool best suited to Chinese developers and enterprises.

Technical highlights of PaddlePaddle
-------------------------------------

- A new-generation deep learning framework: PaddlePaddle is built on a "deep learning programming language". While preserving performance, it greatly improves the framework's expressiveness and can describe any model that may arise.
- Friendlier to large-scale computation: honed by many large-scale workloads inside Baidu, PaddlePaddle performs well in distributed computing; its EDL technology saves substantial compute resources and also supports training large-scale sparse models.
- Visualized deep learning: Visual DL helps developers conveniently observe overall training trends, data sample quality and intermediate results, parameter distributions and their changes, and the model structure, making development easier.

An education ecosystem based on PaddlePaddle
---------------------------------------------

- Deep learning courses: Baidu and top education and training institutions in the Chinese market have jointly developed high-quality deep learning courses and teaching materials to help developers master deep learning from scratch.
- Deep learning practice: for users focused on research and learning, PaddlePaddle provides an online development environment that needs no installation, along with algorithm, compute, and data support.
- Offline training: rich, high-quality offline education activities such as young-teacher training, hands-on camps, and salons.

AI services based on PaddlePaddle
----------------------------------

- EasyDL: helps enterprises with no algorithm background quickly complete a deep learning task; only a small amount of data is needed to obtain a high-quality model.
- AI marketplace: provides a standardized trading mechanism for AI capabilities and products, helping enterprises quickly find what they need and run AI business effectively.
- Deep learning competitions: PaddlePaddle gathers top deep learning developers; enterprises can publish their business problems and quickly find the best solutions through competitions.

You can reach us with any question about PaddlePaddle through the following channels
--------------------------------------------------------------------------------------

- Learning/usage questions: give us feedback in the `PaddlePaddle open-source community <https://github.com/PaddlePaddle/Paddle/issues>`_ or the `PaddlePaddle Chinese community <http://ai.baidu.com/forum/topic/list/168>`_.
- Suggestions for the development of the PaddlePaddle framework: send mail to Paddle-better@baidu.com.

We look forward to building a world-class deep learning framework with you and advancing AI technology together.

PaddlePaddle Team
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

=============
fluid.average
=============
.. _api_fluid_average_WeightedAverage:
WeightedAverage
---------------
.. autoclass:: paddle.fluid.average.WeightedAverage
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==============
fluid.backward
==============
.. _api_fluid_backward_append_backward:
append_backward
---------------
.. autofunction:: paddle.fluid.backward.append_backward
:noindex:
.. _api_fluid_backward_calc_gradient:
calc_gradient
-------------
.. autofunction:: paddle.fluid.backward.calc_gradient
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

==========
fluid.clip
==========
.. _api_fluid_clip_ErrorClipByValue:
ErrorClipByValue
----------------
@@ -12,6 +14,8 @@ ErrorClipByValue
    :members:
    :noindex:
.. _api_fluid_clip_GradientClipByValue:
GradientClipByValue
-------------------
@@ -19,6 +23,8 @@ GradientClipByValue
    :members:
    :noindex:
.. _api_fluid_clip_GradientClipByNorm:
GradientClipByNorm
------------------
@@ -26,6 +32,8 @@ GradientClipByNorm
    :members:
    :noindex:
.. _api_fluid_clip_GradientClipByGlobalNorm:
GradientClipByGlobalNorm
------------------------
@@ -33,15 +41,3 @@ GradientClipByGlobalNorm
    :members:
    :noindex:
append_gradient_clip_ops
------------------------
.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops
:noindex:
error_clip_callback
-------------------
.. autofunction:: paddle.fluid.clip.error_clip_callback
:noindex:
==================================
Data Reader Interface and DataSets
==================================
.. toctree::
:maxdepth: 1
data/data_reader.rst
data/image.rst
data/dataset.rst
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

=================
fluid.data_feeder
=================

.. _api_fluid_data_feeder_DataFeeder:

DataFeeder
----------
......
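As a quick orientation for the renamed `fluid.data_feeder` page, a minimal usage sketch of the `DataFeeder` class documented above (assuming `paddle.fluid` is installed; names and shapes are illustrative)::

    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    img = fluid.layers.data(name="image", shape=[784], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    # DataFeeder converts plain Python samples into the feed dict expected by
    # Executor.run(); each sample lists one value per variable in feed_list order.
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
    feed_dict = feeder.feed([([0.0] * 784, [3]),
                             ([0.5] * 784, [7])])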
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

==============
fluid.executor
==============

.. _api_fluid_executor_Executor:

Executor
--------
@@ -12,24 +14,32 @@ Executor
    :members:
    :noindex:
.. _api_fluid_executor_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.executor.global_scope
    :noindex:
.. _api_fluid_executor_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.executor.scope_guard
    :noindex:
.. _api_fluid_executor__switch_scope:

_switch_scope
-------------
.. autofunction:: paddle.fluid.executor._switch_scope
    :noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====
fluid
=====
.. _api_fluid_Block:
Block
-----
.. autoclass:: paddle.fluid.Block
:members:
:noindex:
.. _api_fluid_Variable:
Variable
--------
.. autoclass:: paddle.fluid.Variable
:members:
:noindex:
.. _api_fluid_Program:
Program
-------
.. autoclass:: paddle.fluid.Program
:members:
:noindex:
.. _api_fluid_Operator:
Operator
--------
.. autoclass:: paddle.fluid.Operator
:members:
:noindex:
.. _api_fluid_default_startup_program:
default_startup_program
-----------------------
.. autofunction:: paddle.fluid.default_startup_program
:noindex:
.. _api_fluid_default_main_program:
default_main_program
--------------------
.. autofunction:: paddle.fluid.default_main_program
:noindex:
.. _api_fluid_program_guard:
program_guard
-------------
.. autofunction:: paddle.fluid.program_guard
:noindex:
.. _api_fluid_get_var:
get_var
-------
.. autofunction:: paddle.fluid.get_var
:noindex:
.. _api_fluid_Executor:
Executor
--------
.. autoclass:: paddle.fluid.Executor
:members:
:noindex:
.. _api_fluid_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.global_scope
:noindex:
.. _api_fluid_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.scope_guard
:noindex:
.. _api_fluid__switch_scope:
_switch_scope
-------------
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
make_channel
------------
.. autofunction:: paddle.fluid.make_channel
:noindex:
.. _api_fluid_channel_send:
channel_send
------------
.. autofunction:: paddle.fluid.channel_send
:noindex:
.. _api_fluid_channel_recv:
channel_recv
------------
.. autofunction:: paddle.fluid.channel_recv
:noindex:
.. _api_fluid_channel_close:
channel_close
-------------
.. autofunction:: paddle.fluid.channel_close
:noindex:
.. _api_fluid_Select:
Select
------
.. autoclass:: paddle.fluid.Select
:members:
:noindex:
.. _api_fluid_Trainer:
Trainer
-------
.. autoclass:: paddle.fluid.Trainer
:members:
:noindex:
.. _api_fluid_BeginEpochEvent:
BeginEpochEvent
---------------
.. autoclass:: paddle.fluid.BeginEpochEvent
:members:
:noindex:
.. _api_fluid_EndEpochEvent:
EndEpochEvent
-------------
.. autoclass:: paddle.fluid.EndEpochEvent
:members:
:noindex:
.. _api_fluid_BeginStepEvent:
BeginStepEvent
--------------
.. autoclass:: paddle.fluid.BeginStepEvent
:members:
:noindex:
.. _api_fluid_EndStepEvent:
EndStepEvent
------------
.. autoclass:: paddle.fluid.EndStepEvent
:members:
:noindex:
.. _api_fluid_CheckpointConfig:
CheckpointConfig
----------------
.. autoclass:: paddle.fluid.CheckpointConfig
:members:
:noindex:
.. _api_fluid_Inferencer:
Inferencer
----------
.. autoclass:: paddle.fluid.Inferencer
:members:
:noindex:
.. _api_fluid_DistributeTranspiler:
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.DistributeTranspiler
:members:
:noindex:
.. _api_fluid_memory_optimize:
memory_optimize
---------------
.. autofunction:: paddle.fluid.memory_optimize
:noindex:
.. _api_fluid_release_memory:
release_memory
--------------
.. autofunction:: paddle.fluid.release_memory
:noindex:
.. _api_fluid_ParallelExecutor:
ParallelExecutor
----------------
.. autoclass:: paddle.fluid.ParallelExecutor
:members:
:noindex:
.. _api_fluid_ExecutionStrategy:
ExecutionStrategy
-----------------
.. autoclass:: paddle.fluid.ExecutionStrategy
:members:
:noindex:
.. _api_fluid_BuildStrategy:
BuildStrategy
-------------
.. autoclass:: paddle.fluid.BuildStrategy
:members:
:noindex:
.. _api_fluid_create_lod_tensor:
create_lod_tensor
-----------------
.. autofunction:: paddle.fluid.create_lod_tensor
:noindex:
.. _api_fluid_create_random_int_lodtensor:
create_random_int_lodtensor
---------------------------
.. autofunction:: paddle.fluid.create_random_int_lodtensor
:noindex:
.. _api_fluid_LoDTensor:
LoDTensor
---------
.. autoclass:: paddle.fluid.LoDTensor
:members:
:noindex:
.. _api_fluid_CPUPlace:
CPUPlace
--------
.. autoclass:: paddle.fluid.CPUPlace
:members:
:noindex:
.. _api_fluid_CUDAPlace:
CUDAPlace
---------
.. autoclass:: paddle.fluid.CUDAPlace
:members:
:noindex:
.. _api_fluid_CUDAPinnedPlace:
CUDAPinnedPlace
---------------
.. autoclass:: paddle.fluid.CUDAPinnedPlace
:members:
:noindex:
.. _api_fluid_Tensor:
Tensor
------
.. autoclass:: paddle.fluid.Tensor
:members:
:noindex:
.. _api_fluid_ParamAttr:
ParamAttr
---------
.. autoclass:: paddle.fluid.ParamAttr
:members:
:noindex:
.. _api_fluid_WeightNormParamAttr:
WeightNormParamAttr
-------------------
.. autoclass:: paddle.fluid.WeightNormParamAttr
:members:
:noindex:
.. _api_fluid_DataFeeder:
DataFeeder
----------
.. autoclass:: paddle.fluid.DataFeeder
:members:
:noindex:
.. _api_fluid_Scope:
Scope
-----
.. autoclass:: paddle.fluid.Scope
:members:
:noindex:
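The new `fluid.rst` page above lists the symbols re-exported at the top level of `paddle.fluid`. A short sketch that ties several of them together (`Program`, `program_guard`, `Executor`, `CPUPlace`), assuming `paddle.fluid` and `numpy` are installed::

    import numpy as np
    import paddle.fluid as fluid

    # Build a tiny network inside a fresh Program pair.
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.layers.data(name="x", shape=[2], dtype="float32")
        y = fluid.layers.fc(input=x, size=1)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)  # initialize parameters

    out, = exe.run(main_prog,
                   feed={"x": np.random.rand(4, 2).astype("float32")},
                   fetch_list=[y])
    print(out.shape)  # (4, 1)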
@@ -29,19 +29,27 @@ def parse_arg():

class DocGenerator(object):
    def __init__(self, module_name=None, stream=sys.stdout):
        if module_name == "":
            module_name = None
        self.stream = stream
        if module_name is None:
            self.module_name = "fluid"
        else:
            self.module_name = "fluid." + module_name
        if module_name is None:
            self.module = fluid
        else:
            if not hasattr(fluid, module_name):
                raise ValueError("Cannot find fluid.{0}".format(module_name))
            else:
                self.module = getattr(fluid, module_name)
        self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

''')
        self._print_header_(self.module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        submodule = getattr(self.module, submodule_name)

@@ -60,25 +68,29 @@ class DocGenerator(object):
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
        item = getattr(self.module, name, None)
        if item is None:
            return
        if isinstance(item, types.TypeType):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
            pass

    def print_class(self, name):
        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
        self.stream.write('''.. autoclass:: paddle.{0}.{1}
    :members:
    :noindex:

'''.format(self.module_name, name))

    def print_method(self, name):
        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
        self.stream.write('''.. autofunction:: paddle.{0}.{1}
    :noindex:

'''.format(self.module_name, name))

@@ -94,6 +106,10 @@ class DocGenerator(object):
        self.stream.write('\n')
        self.stream.write('\n')
    def _print_ref_(self, name):
        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
            self.module_name.split(".")), name))


def main():
    args = parse_arg()
......
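The updated `DocGenerator` prefixes each documented class or function with a Sphinx label via the new `_print_ref_` helper; that is what produces the `.. _api_fluid_layers_fc:` style anchors seen throughout the regenerated pages. A small sketch of driving it directly (assuming `gen_doc.py` is importable from the API doc directory and `paddle.fluid` is installed)::

    import sys
    from gen_doc import DocGenerator

    # Emit the page header plus one cross-referenceable entry to stdout,
    # mirroring one step of `python gen_doc.py layers ... > layers.rst`.
    gen = DocGenerator("layers", stream=sys.stdout)
    gen.print_item("fc")  # writes ".. _api_fluid_layers_fc:" then an autofunction block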
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst

for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
do
  python gen_doc.py ${module} > ${module}.rst
done
python gen_doc.py "" > fluid.rst
=============
API Reference
=============

.. toctree::
    :maxdepth: 1
fluid.rst
layers.rst
data_feeder.rst
executor.rst
@@ -18,3 +19,8 @@ Fluid
regularizer.rst
io.rst
data.rst
transpiler.rst
recordio_writer.rst
backward.rst
average.rst
profiler.rst
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

=================
fluid.initializer
=================
.. _api_fluid_initializer_Constant:
Constant Constant
-------- --------
...@@ -12,6 +14,8 @@ Constant ...@@ -12,6 +14,8 @@ Constant
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_Uniform:
Uniform Uniform
------- -------
...@@ -19,6 +23,8 @@ Uniform ...@@ -19,6 +23,8 @@ Uniform
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_Normal:
Normal Normal
------ ------
...@@ -26,6 +32,8 @@ Normal ...@@ -26,6 +32,8 @@ Normal
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_Xavier:
Xavier Xavier
------ ------
...@@ -33,6 +41,8 @@ Xavier ...@@ -33,6 +41,8 @@ Xavier
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_Bilinear:
Bilinear Bilinear
-------- --------
...@@ -40,18 +50,33 @@ Bilinear ...@@ -40,18 +50,33 @@ Bilinear
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_MSRA:
MSRA
----
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
.. _api_fluid_initializer_force_init_on_cpu:
force_init_on_cpu force_init_on_cpu
----------------- -----------------
.. autofunction:: paddle.fluid.initializer.force_init_on_cpu .. autofunction:: paddle.fluid.initializer.force_init_on_cpu
:noindex: :noindex:
.. _api_fluid_initializer_init_on_cpu:
init_on_cpu init_on_cpu
----------- -----------
.. autofunction:: paddle.fluid.initializer.init_on_cpu .. autofunction:: paddle.fluid.initializer.init_on_cpu
:noindex: :noindex:
.. _api_fluid_initializer_ConstantInitializer:
ConstantInitializer ConstantInitializer
------------------- -------------------
...@@ -59,6 +84,8 @@ ConstantInitializer ...@@ -59,6 +84,8 @@ ConstantInitializer
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_UniformInitializer:
UniformInitializer UniformInitializer
------------------ ------------------
...@@ -66,6 +93,8 @@ UniformInitializer ...@@ -66,6 +93,8 @@ UniformInitializer
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_NormalInitializer:
NormalInitializer NormalInitializer
----------------- -----------------
...@@ -73,6 +102,8 @@ NormalInitializer ...@@ -73,6 +102,8 @@ NormalInitializer
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_XavierInitializer:
XavierInitializer XavierInitializer
----------------- -----------------
...@@ -80,6 +111,8 @@ XavierInitializer ...@@ -80,6 +111,8 @@ XavierInitializer
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_BilinearInitializer:
BilinearInitializer BilinearInitializer
------------------- -------------------
...@@ -87,3 +120,12 @@ BilinearInitializer ...@@ -87,3 +120,12 @@ BilinearInitializer
:members: :members:
:noindex: :noindex:
.. _api_fluid_initializer_MSRAInitializer:
MSRAInitializer
---------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

========
fluid.io
========
.. _api_fluid_io_save_vars:
save_vars save_vars
--------- ---------
...@@ -11,84 +13,112 @@ save_vars ...@@ -11,84 +13,112 @@ save_vars
.. autofunction:: paddle.fluid.io.save_vars .. autofunction:: paddle.fluid.io.save_vars
:noindex: :noindex:
.. _api_fluid_io_save_params:
save_params save_params
----------- -----------
.. autofunction:: paddle.fluid.io.save_params .. autofunction:: paddle.fluid.io.save_params
:noindex: :noindex:
.. _api_fluid_io_save_persistables:
save_persistables save_persistables
----------------- -----------------
.. autofunction:: paddle.fluid.io.save_persistables .. autofunction:: paddle.fluid.io.save_persistables
:noindex: :noindex:
.. _api_fluid_io_load_vars:
load_vars load_vars
--------- ---------
.. autofunction:: paddle.fluid.io.load_vars .. autofunction:: paddle.fluid.io.load_vars
:noindex: :noindex:
.. _api_fluid_io_load_params:
load_params load_params
----------- -----------
.. autofunction:: paddle.fluid.io.load_params .. autofunction:: paddle.fluid.io.load_params
:noindex: :noindex:
.. _api_fluid_io_load_persistables:
load_persistables load_persistables
----------------- -----------------
.. autofunction:: paddle.fluid.io.load_persistables .. autofunction:: paddle.fluid.io.load_persistables
:noindex: :noindex:
.. _api_fluid_io_save_inference_model:
save_inference_model save_inference_model
-------------------- --------------------
.. autofunction:: paddle.fluid.io.save_inference_model .. autofunction:: paddle.fluid.io.save_inference_model
:noindex: :noindex:
.. _api_fluid_io_load_inference_model:
load_inference_model load_inference_model
-------------------- --------------------
.. autofunction:: paddle.fluid.io.load_inference_model .. autofunction:: paddle.fluid.io.load_inference_model
:noindex: :noindex:
.. _api_fluid_io_get_inference_program:
get_inference_program get_inference_program
--------------------- ---------------------
.. autofunction:: paddle.fluid.io.get_inference_program .. autofunction:: paddle.fluid.io.get_inference_program
:noindex: :noindex:
.. _api_fluid_io_save_checkpoint:
save_checkpoint save_checkpoint
--------------- ---------------
.. autofunction:: paddle.fluid.io.save_checkpoint .. autofunction:: paddle.fluid.io.save_checkpoint
:noindex: :noindex:
.. _api_fluid_io_load_checkpoint:
load_checkpoint load_checkpoint
--------------- ---------------
.. autofunction:: paddle.fluid.io.load_checkpoint .. autofunction:: paddle.fluid.io.load_checkpoint
:noindex: :noindex:
.. _api_fluid_io_clean_checkpoint:
clean_checkpoint clean_checkpoint
---------------- ----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint .. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex: :noindex:
.. _api_fluid_io_load_persist_vars_without_grad:
load_persist_vars_without_grad load_persist_vars_without_grad
------------------------------ ------------------------------
.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad .. autofunction:: paddle.fluid.io.load_persist_vars_without_grad
:noindex: :noindex:
.. _api_fluid_io_save_persist_vars_without_grad:
save_persist_vars_without_grad save_persist_vars_without_grad
------------------------------ ------------------------------
.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad .. autofunction:: paddle.fluid.io.save_persist_vars_without_grad
:noindex: :noindex:
.. _api_fluid_io_get_latest_checkpoint_serial:
get_latest_checkpoint_serial get_latest_checkpoint_serial
---------------------------- ----------------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

============
fluid.layers
============
control_flow control_flow
============ ============
.. _api_fluid_layers_split_lod_tensor:
split_lod_tensor split_lod_tensor
---------------- ----------------
.. autofunction:: paddle.fluid.layers.split_lod_tensor .. autofunction:: paddle.fluid.layers.split_lod_tensor
:noindex: :noindex:
.. _api_fluid_layers_merge_lod_tensor:
merge_lod_tensor merge_lod_tensor
---------------- ----------------
.. autofunction:: paddle.fluid.layers.merge_lod_tensor .. autofunction:: paddle.fluid.layers.merge_lod_tensor
:noindex: :noindex:
.. _api_fluid_layers_BlockGuard:
BlockGuard BlockGuard
---------- ----------
...@@ -27,6 +33,8 @@ BlockGuard ...@@ -27,6 +33,8 @@ BlockGuard
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_BlockGuardWithCompletion:
BlockGuardWithCompletion BlockGuardWithCompletion
------------------------ ------------------------
...@@ -34,12 +42,7 @@ BlockGuardWithCompletion ...@@ -34,12 +42,7 @@ BlockGuardWithCompletion
:members: :members:
:noindex: :noindex:
StaticRNNMemoryLink
-------------------
.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
    :members:
    :noindex:

.. _api_fluid_layers_WhileGuard:
WhileGuard WhileGuard
---------- ----------
...@@ -48,6 +51,8 @@ WhileGuard ...@@ -48,6 +51,8 @@ WhileGuard
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_While:
While While
----- -----
...@@ -55,6 +60,8 @@ While ...@@ -55,6 +60,8 @@ While
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_Switch:
Switch Switch
------ ------
...@@ -62,78 +69,104 @@ Switch ...@@ -62,78 +69,104 @@ Switch
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_lod_rank_table:
lod_rank_table lod_rank_table
-------------- --------------
.. autofunction:: paddle.fluid.layers.lod_rank_table .. autofunction:: paddle.fluid.layers.lod_rank_table
:noindex: :noindex:
.. _api_fluid_layers_max_sequence_len:
max_sequence_len max_sequence_len
---------------- ----------------
.. autofunction:: paddle.fluid.layers.max_sequence_len .. autofunction:: paddle.fluid.layers.max_sequence_len
:noindex: :noindex:
.. _api_fluid_layers_lod_tensor_to_array:
lod_tensor_to_array lod_tensor_to_array
------------------- -------------------
.. autofunction:: paddle.fluid.layers.lod_tensor_to_array .. autofunction:: paddle.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
.. _api_fluid_layers_array_to_lod_tensor:
array_to_lod_tensor array_to_lod_tensor
------------------- -------------------
.. autofunction:: paddle.fluid.layers.array_to_lod_tensor .. autofunction:: paddle.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
.. _api_fluid_layers_increment:
increment increment
--------- ---------
.. autofunction:: paddle.fluid.layers.increment .. autofunction:: paddle.fluid.layers.increment
:noindex: :noindex:
.. _api_fluid_layers_array_write:
array_write array_write
----------- -----------
.. autofunction:: paddle.fluid.layers.array_write .. autofunction:: paddle.fluid.layers.array_write
:noindex: :noindex:
.. _api_fluid_layers_create_array:
create_array create_array
------------ ------------
.. autofunction:: paddle.fluid.layers.create_array .. autofunction:: paddle.fluid.layers.create_array
:noindex: :noindex:
.. _api_fluid_layers_less_than:
less_than less_than
--------- ---------
.. autofunction:: paddle.fluid.layers.less_than .. autofunction:: paddle.fluid.layers.less_than
:noindex: :noindex:
.. _api_fluid_layers_equal:
equal equal
----- -----
.. autofunction:: paddle.fluid.layers.equal .. autofunction:: paddle.fluid.layers.equal
:noindex: :noindex:
.. _api_fluid_layers_array_read:
array_read array_read
---------- ----------
.. autofunction:: paddle.fluid.layers.array_read .. autofunction:: paddle.fluid.layers.array_read
:noindex: :noindex:
.. _api_fluid_layers_shrink_memory:
shrink_memory shrink_memory
------------- -------------
.. autofunction:: paddle.fluid.layers.shrink_memory .. autofunction:: paddle.fluid.layers.shrink_memory
:noindex: :noindex:
.. _api_fluid_layers_array_length:
array_length array_length
------------ ------------
.. autofunction:: paddle.fluid.layers.array_length .. autofunction:: paddle.fluid.layers.array_length
:noindex: :noindex:
.. _api_fluid_layers_IfElse:
IfElse IfElse
------ ------
...@@ -141,6 +174,8 @@ IfElse ...@@ -141,6 +174,8 @@ IfElse
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_DynamicRNN:
DynamicRNN DynamicRNN
---------- ----------
...@@ -148,6 +183,8 @@ DynamicRNN ...@@ -148,6 +183,8 @@ DynamicRNN
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_ConditionalBlock:
ConditionalBlock ConditionalBlock
---------------- ----------------
...@@ -155,6 +192,8 @@ ConditionalBlock ...@@ -155,6 +192,8 @@ ConditionalBlock
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_StaticRNN:
StaticRNN StaticRNN
--------- ---------
...@@ -162,12 +201,16 @@ StaticRNN ...@@ -162,12 +201,16 @@ StaticRNN
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_reorder_lod_tensor_by_rank:
reorder_lod_tensor_by_rank reorder_lod_tensor_by_rank
-------------------------- --------------------------
.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank .. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
:noindex: :noindex:
.. _api_fluid_layers_ParallelDo:
ParallelDo ParallelDo
---------- ----------
...@@ -175,12 +218,16 @@ ParallelDo ...@@ -175,12 +218,16 @@ ParallelDo
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_Print:
Print Print
----- -----
.. autofunction:: paddle.fluid.layers.Print .. autofunction:: paddle.fluid.layers.Print
:noindex: :noindex:
.. _api_fluid_layers_is_empty:
is_empty is_empty
-------- --------
...@@ -190,6 +237,8 @@ is_empty ...@@ -190,6 +237,8 @@ is_empty
device device
====== ======
.. _api_fluid_layers_get_places:
get_places get_places
---------- ----------
...@@ -199,12 +248,16 @@ get_places ...@@ -199,12 +248,16 @@ get_places
io io
== ==
.. _api_fluid_layers_data:
data data
---- ----
.. autofunction:: paddle.fluid.layers.data .. autofunction:: paddle.fluid.layers.data
:noindex: :noindex:
.. _api_fluid_layers_BlockGuardServ:
BlockGuardServ BlockGuardServ
-------------- --------------
...@@ -212,6 +265,8 @@ BlockGuardServ ...@@ -212,6 +265,8 @@ BlockGuardServ
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_ListenAndServ:
ListenAndServ ListenAndServ
------------- -------------
...@@ -219,60 +274,80 @@ ListenAndServ ...@@ -219,60 +274,80 @@ ListenAndServ
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_Send:
Send Send
---- ----
.. autofunction:: paddle.fluid.layers.Send .. autofunction:: paddle.fluid.layers.Send
:noindex: :noindex:
.. _api_fluid_layers_Recv:
Recv Recv
---- ----
.. autofunction:: paddle.fluid.layers.Recv .. autofunction:: paddle.fluid.layers.Recv
:noindex: :noindex:
.. _api_fluid_layers_open_recordio_file:
open_recordio_file open_recordio_file
------------------ ------------------
.. autofunction:: paddle.fluid.layers.open_recordio_file .. autofunction:: paddle.fluid.layers.open_recordio_file
:noindex: :noindex:
.. _api_fluid_layers_open_files:
open_files open_files
---------- ----------
.. autofunction:: paddle.fluid.layers.open_files .. autofunction:: paddle.fluid.layers.open_files
:noindex: :noindex:
.. _api_fluid_layers_read_file:
read_file read_file
--------- ---------
.. autofunction:: paddle.fluid.layers.read_file .. autofunction:: paddle.fluid.layers.read_file
:noindex: :noindex:
.. _api_fluid_layers_shuffle:
shuffle shuffle
------- -------
.. autofunction:: paddle.fluid.layers.shuffle .. autofunction:: paddle.fluid.layers.shuffle
:noindex: :noindex:
.. _api_fluid_layers_batch:
batch batch
----- -----
.. autofunction:: paddle.fluid.layers.batch .. autofunction:: paddle.fluid.layers.batch
:noindex: :noindex:
.. _api_fluid_layers_double_buffer:
double_buffer double_buffer
------------- -------------
.. autofunction:: paddle.fluid.layers.double_buffer .. autofunction:: paddle.fluid.layers.double_buffer
:noindex: :noindex:
.. _api_fluid_layers_random_data_generator:
random_data_generator random_data_generator
--------------------- ---------------------
.. autofunction:: paddle.fluid.layers.random_data_generator .. autofunction:: paddle.fluid.layers.random_data_generator
:noindex: :noindex:
.. _api_fluid_layers_Preprocessor:
Preprocessor Preprocessor
------------ ------------
...@@ -280,6 +355,8 @@ Preprocessor ...@@ -280,6 +355,8 @@ Preprocessor
:members: :members:
:noindex: :noindex:
.. _api_fluid_layers_load:
load load
---- ----
...@@ -289,584 +366,802 @@ load ...@@ -289,584 +366,802 @@ load
nn nn
== ==
.. _api_fluid_layers_fc:
fc fc
-- --
.. autofunction:: paddle.fluid.layers.fc .. autofunction:: paddle.fluid.layers.fc
:noindex: :noindex:
.. _api_fluid_layers_embedding:
embedding embedding
--------- ---------
.. autofunction:: paddle.fluid.layers.embedding .. autofunction:: paddle.fluid.layers.embedding
:noindex: :noindex:
.. _api_fluid_layers_dynamic_lstm:
dynamic_lstm dynamic_lstm
------------ ------------
.. autofunction:: paddle.fluid.layers.dynamic_lstm .. autofunction:: paddle.fluid.layers.dynamic_lstm
:noindex: :noindex:
.. _api_fluid_layers_dynamic_lstmp:
dynamic_lstmp dynamic_lstmp
------------- -------------
.. autofunction:: paddle.fluid.layers.dynamic_lstmp .. autofunction:: paddle.fluid.layers.dynamic_lstmp
:noindex: :noindex:
.. _api_fluid_layers_dynamic_gru:
dynamic_gru dynamic_gru
----------- -----------
.. autofunction:: paddle.fluid.layers.dynamic_gru .. autofunction:: paddle.fluid.layers.dynamic_gru
:noindex: :noindex:
.. _api_fluid_layers_gru_unit:
gru_unit gru_unit
-------- --------
.. autofunction:: paddle.fluid.layers.gru_unit .. autofunction:: paddle.fluid.layers.gru_unit
:noindex: :noindex:
.. _api_fluid_layers_linear_chain_crf:
linear_chain_crf linear_chain_crf
---------------- ----------------
.. autofunction:: paddle.fluid.layers.linear_chain_crf .. autofunction:: paddle.fluid.layers.linear_chain_crf
:noindex: :noindex:
.. _api_fluid_layers_crf_decoding:
crf_decoding crf_decoding
------------ ------------
.. autofunction:: paddle.fluid.layers.crf_decoding .. autofunction:: paddle.fluid.layers.crf_decoding
:noindex: :noindex:
.. _api_fluid_layers_cos_sim:
cos_sim cos_sim
------- -------
.. autofunction:: paddle.fluid.layers.cos_sim .. autofunction:: paddle.fluid.layers.cos_sim
:noindex: :noindex:
.. _api_fluid_layers_cross_entropy:
cross_entropy cross_entropy
------------- -------------
.. autofunction:: paddle.fluid.layers.cross_entropy .. autofunction:: paddle.fluid.layers.cross_entropy
:noindex: :noindex:
.. _api_fluid_layers_square_error_cost:
square_error_cost square_error_cost
----------------- -----------------
.. autofunction:: paddle.fluid.layers.square_error_cost .. autofunction:: paddle.fluid.layers.square_error_cost
:noindex: :noindex:
.. _api_fluid_layers_chunk_eval:
chunk_eval chunk_eval
---------- ----------
.. autofunction:: paddle.fluid.layers.chunk_eval .. autofunction:: paddle.fluid.layers.chunk_eval
:noindex: :noindex:
.. _api_fluid_layers_sequence_conv:
sequence_conv sequence_conv
------------- -------------
.. autofunction:: paddle.fluid.layers.sequence_conv .. autofunction:: paddle.fluid.layers.sequence_conv
:noindex: :noindex:
.. _api_fluid_layers_conv2d:
conv2d conv2d
------ ------
.. autofunction:: paddle.fluid.layers.conv2d .. autofunction:: paddle.fluid.layers.conv2d
:noindex: :noindex:
.. _api_fluid_layers_conv3d:
conv3d conv3d
------ ------
.. autofunction:: paddle.fluid.layers.conv3d .. autofunction:: paddle.fluid.layers.conv3d
:noindex: :noindex:
.. _api_fluid_layers_sequence_pool:
sequence_pool sequence_pool
------------- -------------
.. autofunction:: paddle.fluid.layers.sequence_pool .. autofunction:: paddle.fluid.layers.sequence_pool
:noindex: :noindex:
.. _api_fluid_layers_sequence_softmax:
sequence_softmax sequence_softmax
---------------- ----------------
.. autofunction:: paddle.fluid.layers.sequence_softmax .. autofunction:: paddle.fluid.layers.sequence_softmax
:noindex: :noindex:
.. _api_fluid_layers_softmax:
softmax softmax
------- -------
.. autofunction:: paddle.fluid.layers.softmax .. autofunction:: paddle.fluid.layers.softmax
:noindex: :noindex:
.. _api_fluid_layers_pool2d:
pool2d pool2d
------ ------
.. autofunction:: paddle.fluid.layers.pool2d .. autofunction:: paddle.fluid.layers.pool2d
:noindex: :noindex:
.. _api_fluid_layers_pool3d:
pool3d pool3d
------ ------
.. autofunction:: paddle.fluid.layers.pool3d .. autofunction:: paddle.fluid.layers.pool3d
:noindex: :noindex:
.. _api_fluid_layers_batch_norm:
batch_norm batch_norm
---------- ----------
.. autofunction:: paddle.fluid.layers.batch_norm .. autofunction:: paddle.fluid.layers.batch_norm
:noindex: :noindex:
.. _api_fluid_layers_beam_search_decode:
beam_search_decode beam_search_decode
------------------ ------------------
.. autofunction:: paddle.fluid.layers.beam_search_decode .. autofunction:: paddle.fluid.layers.beam_search_decode
:noindex: :noindex:
.. _api_fluid_layers_conv2d_transpose:
conv2d_transpose conv2d_transpose
---------------- ----------------
.. autofunction:: paddle.fluid.layers.conv2d_transpose .. autofunction:: paddle.fluid.layers.conv2d_transpose
:noindex: :noindex:
.. _api_fluid_layers_conv3d_transpose:
conv3d_transpose conv3d_transpose
---------------- ----------------
.. autofunction:: paddle.fluid.layers.conv3d_transpose .. autofunction:: paddle.fluid.layers.conv3d_transpose
:noindex: :noindex:
.. _api_fluid_layers_sequence_expand:
sequence_expand sequence_expand
--------------- ---------------
.. autofunction:: paddle.fluid.layers.sequence_expand .. autofunction:: paddle.fluid.layers.sequence_expand
:noindex: :noindex:
.. _api_fluid_layers_lstm_unit:
lstm_unit lstm_unit
--------- ---------
.. autofunction:: paddle.fluid.layers.lstm_unit .. autofunction:: paddle.fluid.layers.lstm_unit
:noindex: :noindex:
.. _api_fluid_layers_reduce_sum:
reduce_sum reduce_sum
---------- ----------
.. autofunction:: paddle.fluid.layers.reduce_sum .. autofunction:: paddle.fluid.layers.reduce_sum
:noindex: :noindex:
.. _api_fluid_layers_reduce_mean:
reduce_mean reduce_mean
----------- -----------
.. autofunction:: paddle.fluid.layers.reduce_mean .. autofunction:: paddle.fluid.layers.reduce_mean
:noindex: :noindex:
.. _api_fluid_layers_reduce_max:
reduce_max reduce_max
---------- ----------
.. autofunction:: paddle.fluid.layers.reduce_max .. autofunction:: paddle.fluid.layers.reduce_max
:noindex: :noindex:
.. _api_fluid_layers_reduce_min:
reduce_min reduce_min
---------- ----------
.. autofunction:: paddle.fluid.layers.reduce_min .. autofunction:: paddle.fluid.layers.reduce_min
:noindex: :noindex:
.. _api_fluid_layers_reduce_prod:
reduce_prod reduce_prod
----------- -----------
.. autofunction:: paddle.fluid.layers.reduce_prod .. autofunction:: paddle.fluid.layers.reduce_prod
:noindex: :noindex:
.. _api_fluid_layers_sequence_first_step:
sequence_first_step sequence_first_step
------------------- -------------------
.. autofunction:: paddle.fluid.layers.sequence_first_step .. autofunction:: paddle.fluid.layers.sequence_first_step
:noindex: :noindex:
.. _api_fluid_layers_sequence_last_step:
sequence_last_step sequence_last_step
------------------ ------------------
.. autofunction:: paddle.fluid.layers.sequence_last_step .. autofunction:: paddle.fluid.layers.sequence_last_step
:noindex: :noindex:
.. _api_fluid_layers_dropout:
dropout dropout
------- -------
.. autofunction:: paddle.fluid.layers.dropout .. autofunction:: paddle.fluid.layers.dropout
:noindex: :noindex:
.. _api_fluid_layers_split:
split split
----- -----
.. autofunction:: paddle.fluid.layers.split .. autofunction:: paddle.fluid.layers.split
:noindex: :noindex:
.. _api_fluid_layers_ctc_greedy_decoder:
ctc_greedy_decoder ctc_greedy_decoder
------------------ ------------------
.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder .. autofunction:: paddle.fluid.layers.ctc_greedy_decoder
:noindex: :noindex:
.. _api_fluid_layers_edit_distance:
edit_distance edit_distance
------------- -------------
.. autofunction:: paddle.fluid.layers.edit_distance .. autofunction:: paddle.fluid.layers.edit_distance
:noindex: :noindex:
.. _api_fluid_layers_l2_normalize:
l2_normalize l2_normalize
------------ ------------
.. autofunction:: paddle.fluid.layers.l2_normalize .. autofunction:: paddle.fluid.layers.l2_normalize
:noindex: :noindex:
.. _api_fluid_layers_matmul:
matmul matmul
------ ------
.. autofunction:: paddle.fluid.layers.matmul .. autofunction:: paddle.fluid.layers.matmul
:noindex: :noindex:
.. _api_fluid_layers_topk:
topk topk
---- ----
.. autofunction:: paddle.fluid.layers.topk .. autofunction:: paddle.fluid.layers.topk
:noindex: :noindex:
.. _api_fluid_layers_warpctc:
warpctc warpctc
------- -------
.. autofunction:: paddle.fluid.layers.warpctc .. autofunction:: paddle.fluid.layers.warpctc
:noindex: :noindex:
.. _api_fluid_layers_sequence_reshape:
sequence_reshape sequence_reshape
---------------- ----------------
.. autofunction:: paddle.fluid.layers.sequence_reshape .. autofunction:: paddle.fluid.layers.sequence_reshape
:noindex: :noindex:
.. _api_fluid_layers_transpose:
transpose transpose
--------- ---------
.. autofunction:: paddle.fluid.layers.transpose .. autofunction:: paddle.fluid.layers.transpose
:noindex: :noindex:
.. _api_fluid_layers_im2sequence:
im2sequence im2sequence
----------- -----------
.. autofunction:: paddle.fluid.layers.im2sequence .. autofunction:: paddle.fluid.layers.im2sequence
:noindex: :noindex:
.. _api_fluid_layers_nce:
nce nce
--- ---
.. autofunction:: paddle.fluid.layers.nce .. autofunction:: paddle.fluid.layers.nce
:noindex: :noindex:
.. _api_fluid_layers_beam_search:
beam_search beam_search
----------- -----------
.. autofunction:: paddle.fluid.layers.beam_search .. autofunction:: paddle.fluid.layers.beam_search
:noindex: :noindex:
.. _api_fluid_layers_row_conv:
row_conv row_conv
-------- --------
.. autofunction:: paddle.fluid.layers.row_conv .. autofunction:: paddle.fluid.layers.row_conv
:noindex: :noindex:
.. _api_fluid_layers_multiplex:
multiplex multiplex
--------- ---------
.. autofunction:: paddle.fluid.layers.multiplex .. autofunction:: paddle.fluid.layers.multiplex
:noindex: :noindex:
.. _api_fluid_layers_layer_norm:
layer_norm layer_norm
---------- ----------
.. autofunction:: paddle.fluid.layers.layer_norm .. autofunction:: paddle.fluid.layers.layer_norm
:noindex: :noindex:
.. _api_fluid_layers_softmax_with_cross_entropy:
softmax_with_cross_entropy softmax_with_cross_entropy
-------------------------- --------------------------
.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy .. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
:noindex: :noindex:
.. _api_fluid_layers_smooth_l1:
smooth_l1 smooth_l1
--------- ---------
.. autofunction:: paddle.fluid.layers.smooth_l1 .. autofunction:: paddle.fluid.layers.smooth_l1
:noindex: :noindex:
.. _api_fluid_layers_one_hot:
one_hot one_hot
------- -------
.. autofunction:: paddle.fluid.layers.one_hot .. autofunction:: paddle.fluid.layers.one_hot
:noindex: :noindex:
.. _api_fluid_layers_autoincreased_step_counter:
autoincreased_step_counter autoincreased_step_counter
-------------------------- --------------------------
.. autofunction:: paddle.fluid.layers.autoincreased_step_counter .. autofunction:: paddle.fluid.layers.autoincreased_step_counter
:noindex: :noindex:
.. _api_fluid_layers_reshape:
reshape reshape
------- -------
.. autofunction:: paddle.fluid.layers.reshape .. autofunction:: paddle.fluid.layers.reshape
:noindex: :noindex:
.. _api_fluid_layers_lod_reset:
lod_reset lod_reset
--------- ---------
.. autofunction:: paddle.fluid.layers.lod_reset .. autofunction:: paddle.fluid.layers.lod_reset
:noindex: :noindex:
.. _api_fluid_layers_lrn:
lrn lrn
--- ---
.. autofunction:: paddle.fluid.layers.lrn .. autofunction:: paddle.fluid.layers.lrn
:noindex: :noindex:
.. _api_fluid_layers_pad:
pad pad
--- ---
.. autofunction:: paddle.fluid.layers.pad .. autofunction:: paddle.fluid.layers.pad
:noindex: :noindex:
.. _api_fluid_layers_label_smooth:
label_smooth label_smooth
------------ ------------
.. autofunction:: paddle.fluid.layers.label_smooth .. autofunction:: paddle.fluid.layers.label_smooth
:noindex: :noindex:
.. _api_fluid_layers_roi_pool:
roi_pool roi_pool
-------- --------
.. autofunction:: paddle.fluid.layers.roi_pool .. autofunction:: paddle.fluid.layers.roi_pool
:noindex: :noindex:
.. _api_fluid_layers_dice_loss:
dice_loss dice_loss
--------- ---------
.. autofunction:: paddle.fluid.layers.dice_loss .. autofunction:: paddle.fluid.layers.dice_loss
:noindex: :noindex:
.. _api_fluid_layers_image_resize:
image_resize image_resize
------------ ------------
.. autofunction:: paddle.fluid.layers.image_resize .. autofunction:: paddle.fluid.layers.image_resize
:noindex: :noindex:
.. _api_fluid_layers_image_resize_short:
image_resize_short image_resize_short
------------------ ------------------
.. autofunction:: paddle.fluid.layers.image_resize_short .. autofunction:: paddle.fluid.layers.image_resize_short
:noindex: :noindex:
.. _api_fluid_layers_resize_bilinear:
resize_bilinear resize_bilinear
--------------- ---------------
.. autofunction:: paddle.fluid.layers.resize_bilinear .. autofunction:: paddle.fluid.layers.resize_bilinear
:noindex: :noindex:
.. _api_fluid_layers_gather:
gather gather
------ ------
.. autofunction:: paddle.fluid.layers.gather .. autofunction:: paddle.fluid.layers.gather
:noindex: :noindex:
.. _api_fluid_layers_random_crop:
random_crop random_crop
----------- -----------
.. autofunction:: paddle.fluid.layers.random_crop .. autofunction:: paddle.fluid.layers.random_crop
:noindex: :noindex:
.. _api_fluid_layers_mean_iou:
mean_iou mean_iou
-------- --------
.. autofunction:: paddle.fluid.layers.mean_iou .. autofunction:: paddle.fluid.layers.mean_iou
:noindex: :noindex:
.. _api_fluid_layers_relu:
relu
----
.. autofunction:: paddle.fluid.layers.relu
:noindex:
.. _api_fluid_layers_log:
log
---
.. autofunction:: paddle.fluid.layers.log
:noindex:
.. _api_fluid_layers_crop:
crop
----
.. autofunction:: paddle.fluid.layers.crop
:noindex:
ops ops
=== ===
.. _api_fluid_layers_mean:
mean mean
---- ----
.. autofunction:: paddle.fluid.layers.mean .. autofunction:: paddle.fluid.layers.mean
:noindex: :noindex:
.. _api_fluid_layers_mul:
mul mul
--- ---
.. autofunction:: paddle.fluid.layers.mul .. autofunction:: paddle.fluid.layers.mul
:noindex: :noindex:
.. _api_fluid_layers_scale:
scale scale
----- -----
.. autofunction:: paddle.fluid.layers.scale .. autofunction:: paddle.fluid.layers.scale
:noindex: :noindex:
.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
sigmoid_cross_entropy_with_logits sigmoid_cross_entropy_with_logits
--------------------------------- ---------------------------------
.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits .. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex: :noindex:
.. _api_fluid_layers_elementwise_add:
elementwise_add elementwise_add
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_add .. autofunction:: paddle.fluid.layers.elementwise_add
:noindex: :noindex:
.. _api_fluid_layers_elementwise_div:
elementwise_div elementwise_div
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_div .. autofunction:: paddle.fluid.layers.elementwise_div
:noindex: :noindex:
.. _api_fluid_layers_elementwise_sub:
elementwise_sub elementwise_sub
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_sub .. autofunction:: paddle.fluid.layers.elementwise_sub
:noindex: :noindex:
.. _api_fluid_layers_elementwise_mul:
elementwise_mul elementwise_mul
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_mul .. autofunction:: paddle.fluid.layers.elementwise_mul
:noindex: :noindex:
.. _api_fluid_layers_elementwise_max:
elementwise_max elementwise_max
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_max .. autofunction:: paddle.fluid.layers.elementwise_max
:noindex: :noindex:
.. _api_fluid_layers_elementwise_min:
elementwise_min elementwise_min
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_min .. autofunction:: paddle.fluid.layers.elementwise_min
:noindex: :noindex:
.. _api_fluid_layers_elementwise_pow:
elementwise_pow elementwise_pow
--------------- ---------------
.. autofunction:: paddle.fluid.layers.elementwise_pow .. autofunction:: paddle.fluid.layers.elementwise_pow
:noindex: :noindex:
.. _api_fluid_layers_clip:
clip clip
---- ----
.. autofunction:: paddle.fluid.layers.clip .. autofunction:: paddle.fluid.layers.clip
:noindex: :noindex:
.. _api_fluid_layers_clip_by_norm:
clip_by_norm clip_by_norm
------------ ------------
.. autofunction:: paddle.fluid.layers.clip_by_norm .. autofunction:: paddle.fluid.layers.clip_by_norm
:noindex: :noindex:
.. _api_fluid_layers_logical_and:
logical_and logical_and
----------- -----------
.. autofunction:: paddle.fluid.layers.logical_and .. autofunction:: paddle.fluid.layers.logical_and
:noindex: :noindex:
.. _api_fluid_layers_logical_or:
logical_or logical_or
---------- ----------
.. autofunction:: paddle.fluid.layers.logical_or .. autofunction:: paddle.fluid.layers.logical_or
:noindex: :noindex:
.. _api_fluid_layers_logical_xor:
logical_xor logical_xor
----------- -----------
.. autofunction:: paddle.fluid.layers.logical_xor .. autofunction:: paddle.fluid.layers.logical_xor
:noindex: :noindex:
.. _api_fluid_layers_logical_not:
logical_not logical_not
----------- -----------
.. autofunction:: paddle.fluid.layers.logical_not .. autofunction:: paddle.fluid.layers.logical_not
:noindex: :noindex:
.. _api_fluid_layers_uniform_random_batch_size_like:
uniform_random_batch_size_like uniform_random_batch_size_like
------------------------------ ------------------------------
.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like .. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
:noindex: :noindex:
.. _api_fluid_layers_gaussian_random:
gaussian_random gaussian_random
--------------- ---------------
.. autofunction:: paddle.fluid.layers.gaussian_random .. autofunction:: paddle.fluid.layers.gaussian_random
:noindex: :noindex:
.. _api_fluid_layers_gaussian_random_batch_size_like:
gaussian_random_batch_size_like gaussian_random_batch_size_like
------------------------------- -------------------------------
.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like .. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
:noindex: :noindex:
.. _api_fluid_layers_scatter:
scatter scatter
------- -------
.. autofunction:: paddle.fluid.layers.scatter .. autofunction:: paddle.fluid.layers.scatter
:noindex: :noindex:
.. _api_fluid_layers_sum:
sum sum
--- ---
.. autofunction:: paddle.fluid.layers.sum .. autofunction:: paddle.fluid.layers.sum
:noindex: :noindex:
.. _api_fluid_layers_slice:
slice slice
----- -----
.. autofunction:: paddle.fluid.layers.slice .. autofunction:: paddle.fluid.layers.slice
:noindex: :noindex:
.. _api_fluid_layers_polygon_box_transform:
polygon_box_transform polygon_box_transform
--------------------- ---------------------
.. autofunction:: paddle.fluid.layers.polygon_box_transform .. autofunction:: paddle.fluid.layers.polygon_box_transform
:noindex: :noindex:
.. _api_fluid_layers_shape:
shape shape
----- -----
.. autofunction:: paddle.fluid.layers.shape .. autofunction:: paddle.fluid.layers.shape
:noindex: :noindex:
.. _api_fluid_layers_iou_similarity:
iou_similarity
--------------
.. autofunction:: paddle.fluid.layers.iou_similarity
:noindex:
.. _api_fluid_layers_maxout:
maxout maxout
------ ------
.. autofunction:: paddle.fluid.layers.maxout .. autofunction:: paddle.fluid.layers.maxout
:noindex: :noindex:
.. _api_fluid_layers_sigmoid:
sigmoid sigmoid
------- -------
.. autofunction:: paddle.fluid.layers.sigmoid .. autofunction:: paddle.fluid.layers.sigmoid
:noindex: :noindex:
.. _api_fluid_layers_logsigmoid:
logsigmoid logsigmoid
---------- ----------
.. autofunction:: paddle.fluid.layers.logsigmoid .. autofunction:: paddle.fluid.layers.logsigmoid
:noindex: :noindex:
.. _api_fluid_layers_exp:
exp exp
--- ---
.. autofunction:: paddle.fluid.layers.exp .. autofunction:: paddle.fluid.layers.exp
:noindex: :noindex:
relu .. _api_fluid_layers_tanh:
----
.. autofunction:: paddle.fluid.layers.relu
:noindex:
tanh tanh
---- ----
...@@ -874,71 +1169,87 @@ tanh ...@@ -874,71 +1169,87 @@ tanh
.. autofunction:: paddle.fluid.layers.tanh .. autofunction:: paddle.fluid.layers.tanh
:noindex: :noindex:
.. _api_fluid_layers_tanh_shrink:
tanh_shrink tanh_shrink
----------- -----------
.. autofunction:: paddle.fluid.layers.tanh_shrink .. autofunction:: paddle.fluid.layers.tanh_shrink
:noindex: :noindex:
.. _api_fluid_layers_softshrink:
softshrink softshrink
---------- ----------
.. autofunction:: paddle.fluid.layers.softshrink .. autofunction:: paddle.fluid.layers.softshrink
:noindex: :noindex:
.. _api_fluid_layers_sqrt:
sqrt sqrt
---- ----
.. autofunction:: paddle.fluid.layers.sqrt .. autofunction:: paddle.fluid.layers.sqrt
:noindex: :noindex:
.. _api_fluid_layers_abs:
abs abs
--- ---
.. autofunction:: paddle.fluid.layers.abs .. autofunction:: paddle.fluid.layers.abs
:noindex: :noindex:
.. _api_fluid_layers_ceil:
ceil ceil
---- ----
.. autofunction:: paddle.fluid.layers.ceil .. autofunction:: paddle.fluid.layers.ceil
:noindex: :noindex:
.. _api_fluid_layers_floor:
floor floor
----- -----
.. autofunction:: paddle.fluid.layers.floor .. autofunction:: paddle.fluid.layers.floor
:noindex: :noindex:
.. _api_fluid_layers_cos:
cos cos
--- ---
.. autofunction:: paddle.fluid.layers.cos .. autofunction:: paddle.fluid.layers.cos
:noindex: :noindex:
.. _api_fluid_layers_sin:
sin sin
--- ---
.. autofunction:: paddle.fluid.layers.sin .. autofunction:: paddle.fluid.layers.sin
:noindex: :noindex:
.. _api_fluid_layers_round:
round round
----- -----
.. autofunction:: paddle.fluid.layers.round .. autofunction:: paddle.fluid.layers.round
:noindex: :noindex:
.. _api_fluid_layers_reciprocal:
reciprocal reciprocal
---------- ----------
.. autofunction:: paddle.fluid.layers.reciprocal .. autofunction:: paddle.fluid.layers.reciprocal
:noindex: :noindex:
log .. _api_fluid_layers_square:
---
.. autofunction:: paddle.fluid.layers.log
:noindex:
square square
------ ------
...@@ -946,90 +1257,120 @@ square ...@@ -946,90 +1257,120 @@ square
.. autofunction:: paddle.fluid.layers.square .. autofunction:: paddle.fluid.layers.square
:noindex: :noindex:
.. _api_fluid_layers_softplus:
softplus softplus
-------- --------
.. autofunction:: paddle.fluid.layers.softplus .. autofunction:: paddle.fluid.layers.softplus
:noindex: :noindex:
.. _api_fluid_layers_softsign:
softsign softsign
-------- --------
.. autofunction:: paddle.fluid.layers.softsign .. autofunction:: paddle.fluid.layers.softsign
:noindex: :noindex:
.. _api_fluid_layers_brelu:
brelu brelu
----- -----
.. autofunction:: paddle.fluid.layers.brelu .. autofunction:: paddle.fluid.layers.brelu
:noindex: :noindex:
.. _api_fluid_layers_leaky_relu:
leaky_relu leaky_relu
---------- ----------
.. autofunction:: paddle.fluid.layers.leaky_relu .. autofunction:: paddle.fluid.layers.leaky_relu
:noindex: :noindex:
.. _api_fluid_layers_soft_relu:
soft_relu soft_relu
--------- ---------
.. autofunction:: paddle.fluid.layers.soft_relu .. autofunction:: paddle.fluid.layers.soft_relu
:noindex: :noindex:
.. _api_fluid_layers_elu:
elu elu
--- ---
.. autofunction:: paddle.fluid.layers.elu .. autofunction:: paddle.fluid.layers.elu
:noindex: :noindex:
.. _api_fluid_layers_relu6:
relu6 relu6
----- -----
.. autofunction:: paddle.fluid.layers.relu6 .. autofunction:: paddle.fluid.layers.relu6
:noindex: :noindex:
.. _api_fluid_layers_pow:
pow pow
--- ---
.. autofunction:: paddle.fluid.layers.pow .. autofunction:: paddle.fluid.layers.pow
:noindex: :noindex:
.. _api_fluid_layers_stanh:
stanh stanh
----- -----
.. autofunction:: paddle.fluid.layers.stanh .. autofunction:: paddle.fluid.layers.stanh
:noindex: :noindex:
.. _api_fluid_layers_hard_sigmoid:
hard_sigmoid hard_sigmoid
------------ ------------
.. autofunction:: paddle.fluid.layers.hard_sigmoid .. autofunction:: paddle.fluid.layers.hard_sigmoid
:noindex: :noindex:
.. _api_fluid_layers_swish:
swish swish
----- -----
.. autofunction:: paddle.fluid.layers.swish .. autofunction:: paddle.fluid.layers.swish
:noindex: :noindex:
.. _api_fluid_layers_uniform_random:
uniform_random uniform_random
-------------- --------------
.. autofunction:: paddle.fluid.layers.uniform_random .. autofunction:: paddle.fluid.layers.uniform_random
:noindex: :noindex:
.. _api_fluid_layers_hard_shrink:
hard_shrink hard_shrink
----------- -----------
.. autofunction:: paddle.fluid.layers.hard_shrink .. autofunction:: paddle.fluid.layers.hard_shrink
:noindex: :noindex:
.. _api_fluid_layers_cumsum:
cumsum cumsum
------ ------
.. autofunction:: paddle.fluid.layers.cumsum .. autofunction:: paddle.fluid.layers.cumsum
:noindex: :noindex:
.. _api_fluid_layers_thresholded_relu:
thresholded_relu thresholded_relu
---------------- ----------------
...@@ -1039,198 +1380,391 @@ thresholded_relu ...@@ -1039,198 +1380,391 @@ thresholded_relu
tensor tensor
====== ======
.. _api_fluid_layers_create_tensor:
create_tensor create_tensor
------------- -------------
.. autofunction:: paddle.fluid.layers.create_tensor .. autofunction:: paddle.fluid.layers.create_tensor
:noindex: :noindex:
.. _api_fluid_layers_create_parameter:
create_parameter create_parameter
---------------- ----------------
.. autofunction:: paddle.fluid.layers.create_parameter .. autofunction:: paddle.fluid.layers.create_parameter
:noindex: :noindex:
.. _api_fluid_layers_create_global_var:
create_global_var create_global_var
----------------- -----------------
.. autofunction:: paddle.fluid.layers.create_global_var .. autofunction:: paddle.fluid.layers.create_global_var
:noindex: :noindex:
.. _api_fluid_layers_cast:
cast cast
---- ----
.. autofunction:: paddle.fluid.layers.cast .. autofunction:: paddle.fluid.layers.cast
:noindex: :noindex:
.. _api_fluid_layers_concat:
concat concat
------ ------
.. autofunction:: paddle.fluid.layers.concat .. autofunction:: paddle.fluid.layers.concat
:noindex: :noindex:
.. _api_fluid_layers_sums:
sums sums
---- ----
.. autofunction:: paddle.fluid.layers.sums .. autofunction:: paddle.fluid.layers.sums
:noindex: :noindex:
.. _api_fluid_layers_assign:
assign assign
------ ------
.. autofunction:: paddle.fluid.layers.assign .. autofunction:: paddle.fluid.layers.assign
:noindex: :noindex:
.. _api_fluid_layers_fill_constant_batch_size_like:
fill_constant_batch_size_like fill_constant_batch_size_like
----------------------------- -----------------------------
.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like .. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
:noindex: :noindex:
.. _api_fluid_layers_fill_constant:
fill_constant fill_constant
------------- -------------
.. autofunction:: paddle.fluid.layers.fill_constant .. autofunction:: paddle.fluid.layers.fill_constant
:noindex: :noindex:
.. _api_fluid_layers_argmin:
argmin argmin
------ ------
.. autofunction:: paddle.fluid.layers.argmin .. autofunction:: paddle.fluid.layers.argmin
:noindex: :noindex:
.. _api_fluid_layers_argmax:
argmax argmax
------ ------
.. autofunction:: paddle.fluid.layers.argmax .. autofunction:: paddle.fluid.layers.argmax
:noindex: :noindex:
.. _api_fluid_layers_argsort:
argsort argsort
------ -------
.. autofunction:: paddle.fluid.layers.argsort .. autofunction:: paddle.fluid.layers.argsort
:noindex: :noindex:
.. _api_fluid_layers_ones:
ones ones
---- ----
.. autofunction:: paddle.fluid.layers.ones .. autofunction:: paddle.fluid.layers.ones
:noindex: :noindex:
.. _api_fluid_layers_zeros:
zeros zeros
----- -----
.. autofunction:: paddle.fluid.layers.zeros .. autofunction:: paddle.fluid.layers.zeros
:noindex: :noindex:
.. _api_fluid_layers_reverse:
reverse
-------
.. autofunction:: paddle.fluid.layers.reverse
:noindex:
learning_rate_scheduler
=======================
.. _api_fluid_layers_exponential_decay:
exponential_decay
-----------------
.. autofunction:: paddle.fluid.layers.exponential_decay
:noindex:
.. _api_fluid_layers_natural_exp_decay:
natural_exp_decay
-----------------
.. autofunction:: paddle.fluid.layers.natural_exp_decay
:noindex:
.. _api_fluid_layers_inverse_time_decay:
inverse_time_decay
------------------
.. autofunction:: paddle.fluid.layers.inverse_time_decay
:noindex:
.. _api_fluid_layers_polynomial_decay:
polynomial_decay
----------------
.. autofunction:: paddle.fluid.layers.polynomial_decay
:noindex:
.. _api_fluid_layers_piecewise_decay:
piecewise_decay
---------------
.. autofunction:: paddle.fluid.layers.piecewise_decay
:noindex:
.. _api_fluid_layers_noam_decay:
noam_decay
----------
.. autofunction:: paddle.fluid.layers.noam_decay
:noindex:
.. _api_fluid_layers_append_LARS:
append_LARS
-----------
.. autofunction:: paddle.fluid.layers.append_LARS
:noindex:
detection detection
========= =========
.. _api_fluid_layers_prior_box:
prior_box prior_box
--------- ---------
.. autofunction:: paddle.fluid.layers.prior_box .. autofunction:: paddle.fluid.layers.prior_box
:noindex: :noindex:
.. _api_fluid_layers_multi_box_head:
multi_box_head multi_box_head
-------------- --------------
.. autofunction:: paddle.fluid.layers.multi_box_head .. autofunction:: paddle.fluid.layers.multi_box_head
:noindex: :noindex:
.. _api_fluid_layers_bipartite_match:
bipartite_match bipartite_match
--------------- ---------------
.. autofunction:: paddle.fluid.layers.bipartite_match .. autofunction:: paddle.fluid.layers.bipartite_match
:noindex: :noindex:
.. _api_fluid_layers_target_assign:
target_assign target_assign
------------- -------------
.. autofunction:: paddle.fluid.layers.target_assign .. autofunction:: paddle.fluid.layers.target_assign
:noindex: :noindex:
.. _api_fluid_layers_detection_output:
detection_output detection_output
---------------- ----------------
.. autofunction:: paddle.fluid.layers.detection_output .. autofunction:: paddle.fluid.layers.detection_output
:noindex: :noindex:
.. _api_fluid_layers_ssd_loss:
ssd_loss ssd_loss
-------- --------
.. autofunction:: paddle.fluid.layers.ssd_loss .. autofunction:: paddle.fluid.layers.ssd_loss
:noindex: :noindex:
.. _api_fluid_layers_detection_map:
detection_map detection_map
------------- -------------
.. autofunction:: paddle.fluid.layers.detection_map .. autofunction:: paddle.fluid.layers.detection_map
:noindex: :noindex:
.. _api_fluid_layers_iou_similarity:
iou_similarity iou_similarity
-------------- --------------
.. autofunction:: paddle.fluid.layers.iou_similarity .. autofunction:: paddle.fluid.layers.iou_similarity
:noindex: :noindex:
.. _api_fluid_layers_box_coder:
box_coder box_coder
--------- ---------
.. autofunction:: paddle.fluid.layers.box_coder .. autofunction:: paddle.fluid.layers.box_coder
:noindex: :noindex:
learning_rate_scheduler metric_op
======================= =========
exponential_decay .. _api_fluid_layers_accuracy:
-----------------
.. autofunction:: paddle.fluid.layers.exponential_decay accuracy
--------
.. autofunction:: paddle.fluid.layers.accuracy
:noindex: :noindex:
natural_exp_decay .. _api_fluid_layers_auc:
-----------------
.. autofunction:: paddle.fluid.layers.natural_exp_decay auc
---
.. autofunction:: paddle.fluid.layers.auc
:noindex: :noindex:
inverse_time_decay tensor
------------------ ======
.. autofunction:: paddle.fluid.layers.inverse_time_decay .. _api_fluid_layers_create_tensor:
create_tensor
-------------
.. autofunction:: paddle.fluid.layers.create_tensor
:noindex: :noindex:
polynomial_decay .. _api_fluid_layers_create_parameter:
create_parameter
---------------- ----------------
.. autofunction:: paddle.fluid.layers.polynomial_decay .. autofunction:: paddle.fluid.layers.create_parameter
:noindex: :noindex:
piecewise_decay .. _api_fluid_layers_create_global_var:
---------------
.. autofunction:: paddle.fluid.layers.piecewise_decay create_global_var
-----------------
.. autofunction:: paddle.fluid.layers.create_global_var
:noindex: :noindex:
noam_decay .. _api_fluid_layers_cast:
----------
.. autofunction:: paddle.fluid.layers.noam_decay cast
----
.. autofunction:: paddle.fluid.layers.cast
:noindex: :noindex:
metric .. _api_fluid_layers_concat:
======
accuracy concat
-------- ------
.. autofunction:: paddle.fluid.layers.accuracy .. autofunction:: paddle.fluid.layers.concat
:noindex: :noindex:
auc .. _api_fluid_layers_sums:
---
.. autofunction:: paddle.fluid.layers.auc sums
----
.. autofunction:: paddle.fluid.layers.sums
:noindex:
.. _api_fluid_layers_assign:
assign
------
.. autofunction:: paddle.fluid.layers.assign
:noindex:
.. _api_fluid_layers_fill_constant_batch_size_like:
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
:noindex:
.. _api_fluid_layers_fill_constant:
fill_constant
-------------
.. autofunction:: paddle.fluid.layers.fill_constant
:noindex:
.. _api_fluid_layers_argmin:
argmin
------
.. autofunction:: paddle.fluid.layers.argmin
:noindex:
.. _api_fluid_layers_argmax:
argmax
------
.. autofunction:: paddle.fluid.layers.argmax
:noindex:
.. _api_fluid_layers_ones:
ones
----
.. autofunction:: paddle.fluid.layers.ones
:noindex:
.. _api_fluid_layers_zeros:
zeros
-----
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
.. _api_fluid_layers_reverse:
reverse
-------
.. autofunction:: paddle.fluid.layers.reverse
:noindex: :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
======= =============
metrics fluid.metrics
======= =============
.. _api_fluid_metrics_MetricBase:
MetricBase MetricBase
---------- ----------
...@@ -12,6 +14,8 @@ MetricBase ...@@ -12,6 +14,8 @@ MetricBase
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_CompositeMetric:
CompositeMetric CompositeMetric
--------------- ---------------
...@@ -19,6 +23,26 @@ CompositeMetric ...@@ -19,6 +23,26 @@ CompositeMetric
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_Precision:
Precision
---------
.. autoclass:: paddle.fluid.metrics.Precision
:members:
:noindex:
.. _api_fluid_metrics_Recall:
Recall
------
.. autoclass:: paddle.fluid.metrics.Recall
:members:
:noindex:
.. _api_fluid_metrics_Accuracy:
Accuracy Accuracy
-------- --------
...@@ -26,6 +50,8 @@ Accuracy ...@@ -26,6 +50,8 @@ Accuracy
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_ChunkEvaluator:
ChunkEvaluator ChunkEvaluator
-------------- --------------
...@@ -33,6 +59,8 @@ ChunkEvaluator ...@@ -33,6 +59,8 @@ ChunkEvaluator
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_EditDistance:
EditDistance EditDistance
------------ ------------
...@@ -40,6 +68,8 @@ EditDistance ...@@ -40,6 +68,8 @@ EditDistance
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_DetectionMAP:
DetectionMAP DetectionMAP
------------ ------------
...@@ -47,6 +77,8 @@ DetectionMAP ...@@ -47,6 +77,8 @@ DetectionMAP
:members: :members:
:noindex: :noindex:
.. _api_fluid_metrics_Auc:
Auc Auc
--- ---
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
==== ==========
nets fluid.nets
==== ==========
.. _api_fluid_nets_simple_img_conv_pool:
simple_img_conv_pool simple_img_conv_pool
-------------------- --------------------
...@@ -11,18 +13,24 @@ simple_img_conv_pool ...@@ -11,18 +13,24 @@ simple_img_conv_pool
.. autofunction:: paddle.fluid.nets.simple_img_conv_pool .. autofunction:: paddle.fluid.nets.simple_img_conv_pool
:noindex: :noindex:
.. _api_fluid_nets_sequence_conv_pool:
sequence_conv_pool sequence_conv_pool
------------------ ------------------
.. autofunction:: paddle.fluid.nets.sequence_conv_pool .. autofunction:: paddle.fluid.nets.sequence_conv_pool
:noindex: :noindex:
.. _api_fluid_nets_glu:
glu glu
--- ---
.. autofunction:: paddle.fluid.nets.glu .. autofunction:: paddle.fluid.nets.glu
:noindex: :noindex:
.. _api_fluid_nets_scaled_dot_product_attention:
scaled_dot_product_attention scaled_dot_product_attention
---------------------------- ----------------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
========= ===============
optimizer fluid.optimizer
========= ===============
.. _api_fluid_optimizer_SGD:
SGD SGD
--- ---
...@@ -12,6 +14,8 @@ SGD ...@@ -12,6 +14,8 @@ SGD
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Momentum:
Momentum Momentum
-------- --------
...@@ -19,6 +23,8 @@ Momentum ...@@ -19,6 +23,8 @@ Momentum
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Adagrad:
Adagrad Adagrad
------- -------
...@@ -26,6 +32,8 @@ Adagrad ...@@ -26,6 +32,8 @@ Adagrad
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Adam:
Adam Adam
---- ----
...@@ -33,6 +41,8 @@ Adam ...@@ -33,6 +41,8 @@ Adam
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Adamax:
Adamax Adamax
------ ------
...@@ -40,6 +50,8 @@ Adamax ...@@ -40,6 +50,8 @@ Adamax
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_DecayedAdagrad:
DecayedAdagrad DecayedAdagrad
-------------- --------------
...@@ -47,6 +59,17 @@ DecayedAdagrad ...@@ -47,6 +59,17 @@ DecayedAdagrad
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Ftrl:
Ftrl
----
.. autoclass:: paddle.fluid.optimizer.Ftrl
:members:
:noindex:
.. _api_fluid_optimizer_SGDOptimizer:
SGDOptimizer SGDOptimizer
------------ ------------
...@@ -54,6 +77,8 @@ SGDOptimizer ...@@ -54,6 +77,8 @@ SGDOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_MomentumOptimizer:
MomentumOptimizer MomentumOptimizer
----------------- -----------------
...@@ -61,6 +86,8 @@ MomentumOptimizer ...@@ -61,6 +86,8 @@ MomentumOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_AdagradOptimizer:
AdagradOptimizer AdagradOptimizer
---------------- ----------------
...@@ -68,6 +95,8 @@ AdagradOptimizer ...@@ -68,6 +95,8 @@ AdagradOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_AdamOptimizer:
AdamOptimizer AdamOptimizer
------------- -------------
...@@ -75,6 +104,8 @@ AdamOptimizer ...@@ -75,6 +104,8 @@ AdamOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_AdamaxOptimizer:
AdamaxOptimizer AdamaxOptimizer
--------------- ---------------
...@@ -82,6 +113,8 @@ AdamaxOptimizer ...@@ -82,6 +113,8 @@ AdamaxOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_DecayedAdagradOptimizer:
DecayedAdagradOptimizer DecayedAdagradOptimizer
----------------------- -----------------------
...@@ -89,6 +122,8 @@ DecayedAdagradOptimizer ...@@ -89,6 +122,8 @@ DecayedAdagradOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer RMSPropOptimizer
---------------- ----------------
...@@ -96,6 +131,17 @@ RMSPropOptimizer ...@@ -96,6 +131,17 @@ RMSPropOptimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_FtrlOptimizer:
FtrlOptimizer
-------------
.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_Adadelta:
Adadelta Adadelta
-------- --------
...@@ -103,6 +149,8 @@ Adadelta ...@@ -103,6 +149,8 @@ Adadelta
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_ModelAverage:
ModelAverage ModelAverage
------------ ------------
...@@ -110,6 +158,8 @@ ModelAverage ...@@ -110,6 +158,8 @@ ModelAverage
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_Optimizer:
Optimizer Optimizer
--------- ---------
...@@ -117,3 +167,12 @@ Optimizer ...@@ -117,3 +167,12 @@ Optimizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
========== ================
param_attr fluid.param_attr
========== ================
.. _api_fluid_param_attr_ParamAttr:
ParamAttr ParamAttr
--------- ---------
...@@ -12,6 +14,8 @@ ParamAttr ...@@ -12,6 +14,8 @@ ParamAttr
:members: :members:
:noindex: :noindex:
.. _api_fluid_param_attr_WeightNormParamAttr:
WeightNormParamAttr WeightNormParamAttr
------------------- -------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
======== ==============
profiler fluid.profiler
======== ==============
.. _api_fluid_profiler_cuda_profiler:
cuda_profiler cuda_profiler
------------- -------------
...@@ -11,24 +13,32 @@ cuda_profiler ...@@ -11,24 +13,32 @@ cuda_profiler
.. autofunction:: paddle.fluid.profiler.cuda_profiler .. autofunction:: paddle.fluid.profiler.cuda_profiler
:noindex: :noindex:
.. _api_fluid_profiler_reset_profiler:
reset_profiler reset_profiler
-------------- --------------
.. autofunction:: paddle.fluid.profiler.reset_profiler .. autofunction:: paddle.fluid.profiler.reset_profiler
:noindex: :noindex:
.. _api_fluid_profiler_profiler:
profiler profiler
-------- --------
.. autofunction:: paddle.fluid.profiler.profiler .. autofunction:: paddle.fluid.profiler.profiler
:noindex: :noindex:
.. _api_fluid_profiler_start_profiler:
start_profiler start_profiler
-------------- --------------
.. autofunction:: paddle.fluid.profiler.start_profiler .. autofunction:: paddle.fluid.profiler.start_profiler
:noindex: :noindex:
.. _api_fluid_profiler_stop_profiler:
stop_profiler stop_profiler
------------- -------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====================
fluid.recordio_writer
=====================
.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
convert_reader_to_recordio_file
-------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
:noindex:
.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
convert_reader_to_recordio_files
--------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
=========== =================
regularizer fluid.regularizer
=========== =================
.. _api_fluid_regularizer_append_regularization_ops:
append_regularization_ops append_regularization_ops
------------------------- -------------------------
...@@ -11,12 +13,7 @@ append_regularization_ops ...@@ -11,12 +13,7 @@ append_regularization_ops
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops .. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex: :noindex:
WeightDecayRegularizer .. _api_fluid_regularizer_L1Decay:
----------------------
.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
:members:
:noindex:
L1Decay L1Decay
------- -------
...@@ -25,6 +22,8 @@ L1Decay ...@@ -25,6 +22,8 @@ L1Decay
:members: :members:
:noindex: :noindex:
.. _api_fluid_regularizer_L2Decay:
L2Decay L2Decay
------- -------
...@@ -32,6 +31,8 @@ L2Decay ...@@ -32,6 +31,8 @@ L2Decay
:members: :members:
:noindex: :noindex:
.. _api_fluid_regularizer_L1DecayRegularizer:
L1DecayRegularizer L1DecayRegularizer
------------------ ------------------
...@@ -39,6 +40,8 @@ L1DecayRegularizer ...@@ -39,6 +40,8 @@ L1DecayRegularizer
:members: :members:
:noindex: :noindex:
.. _api_fluid_regularizer_L2DecayRegularizer:
L2DecayRegularizer L2DecayRegularizer
------------------ ------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY! !DO NOT EDIT THIS FILE MANUALLY!
========== ================
transpiler fluid.transpiler
========== ================
.. _api_fluid_transpiler_DistributeTranspiler:
DistributeTranspiler DistributeTranspiler
-------------------- --------------------
...@@ -12,12 +14,7 @@ DistributeTranspiler ...@@ -12,12 +14,7 @@ DistributeTranspiler
:members: :members:
:noindex: :noindex:
InferenceTranspiler .. _api_fluid_transpiler_memory_optimize:
-------------------
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
:members:
:noindex:
memory_optimize memory_optimize
--------------- ---------------
...@@ -25,12 +22,16 @@ memory_optimize ...@@ -25,12 +22,16 @@ memory_optimize
.. autofunction:: paddle.fluid.transpiler.memory_optimize .. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex: :noindex:
.. _api_fluid_transpiler_release_memory:
release_memory release_memory
-------------- --------------
.. autofunction:: paddle.fluid.transpiler.release_memory .. autofunction:: paddle.fluid.transpiler.release_memory
:noindex: :noindex:
.. _api_fluid_transpiler_HashName:
HashName HashName
-------- --------
...@@ -38,9 +39,12 @@ HashName ...@@ -38,9 +39,12 @@ HashName
:members: :members:
:noindex: :noindex:
.. _api_fluid_transpiler_RoundRobin:
RoundRobin RoundRobin
---------- ----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin .. autoclass:: paddle.fluid.transpiler.RoundRobin
:members: :members:
:noindex: :noindex:
...@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows: ...@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:
## Slicing of LoD Tensors ## Slicing of LoD Tensors
When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**. When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
For example, the <2>-slice of above example is For example, the <2>-slice of above example is
...@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is ...@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
10 12 10 12
|| ||
``` ```
## Length Representation vs Offset Representation
The offset representation is an implementation-oriented decision, and it makes the idea behind LoDTensor difficult to understand.
Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
```Python
# length representation of lod called recursive_sequence_lengths
recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
# Create a LoDTensor that has the above recursive_sequence_lengths info.
# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
tensor = fluid.LoDTensor()
tensor.set_recursive_sequence_lengths(recursive_seq_lens)
# Set/Change the recursive_sequence_lengths info of LoDTensor
tensor.set_recursive_sequence_lengths([[3, 1, 2]])
# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
new_recursive_seq_lens = tensor.recursive_sequence_lengths()
```
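For intuition, the sketch below (an illustrative helper, not part of the Fluid API) shows how a length-based `recursive_sequence_lengths` maps to the offset-based LoD that the C++ side stores internally:
```Python
def lengths_to_offsets(recursive_seq_lens):
    """Convert each level of lengths to cumulative offsets, e.g. [3, 1, 2] -> [0, 3, 4, 6]."""
    offsets = []
    for lengths in recursive_seq_lens:
        level = [0]
        for length in lengths:
            level.append(level[-1] + length)
        offsets.append(level)
    return offsets

# [[3, 1, 2], [2, 2, 1, 3, 1, 2]] -> [[0, 3, 4, 6], [0, 2, 4, 5, 8, 9, 11]]
print(lengths_to_offsets([[3, 1, 2], [2, 2, 1, 3, 1, 2]]))
```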
# Python Data Feeding
In the current implementation of Paddle Fluid, there are two ways to feed data:
- Use `reader_op` on the backend C++ side. This method only supports feeding data from recordio files and random data generators, but it supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
- Feed data directly using `DataFeeder.feed()` in Python code. This is more flexible than the first way: many kinds of preprocessing can be performed before feeding, in Python or any other language, instead of adding many uncommon `operators` on the C++ side. But it is less efficient: the program cannot read the next mini-batch before `Executor::Run()` ends, and `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. A minimal sketch of this way is shown right after this list.
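Below is that sketch; the `train_reader` generator and the layer shapes are placeholders, not taken from any particular model:
```Python
import paddle.fluid as fluid

# Two illustrative input layers; real models define their own.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

for data in train_reader():            # train_reader: a user-defined Python mini-batch generator
    exe.run(fluid.default_main_program(),
            feed=feeder.feed(data),    # preprocessing and conversion happen in Python
            fetch_list=[])
```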
In this document, we design a Python data feeding process that combines the efficiency of the first way with the flexibility of the second. A data queue, `LoDTensorBlockingQueue`, is shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, while a `reader_op` on the C++ side reads the data out of the queue.
## Design of LoDTensorBlockingQueue
`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector<framework::LoDTensor>` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
```C++
class LoDTensorBlockingQueueHolder;
class LoDTensorBlockingQueue {
friend class LoDTensorBlockingQueueHolder;
private:
// `LoDTensorBlockingQueue` can only be constructed by
// `LoDTensorBlockingQueueHolder::InitOnce()`
LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims);
public:
size_t Size() const { return queue_.Size(); } // Get the current size of the queue
size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
void Close() { return queue_.Close(); }
bool IsClosed() const { return queue_.IsClosed(); }
// Block if Size() == Cap()
// Return false only when queue_.IsClosed() == true
bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec);
// Block if Size() == 0.
// *Success == false when queue_.IsClosed() == true
std::vector<framework::LoDTensor> Pop(bool *success = nullptr);
private:
// Use reader::BlockingQueue as the inner data structure
BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_;
};
class LoDTensorBlockingQueueHolder {
public:
// Call the constructor of `LoDTensorBlockingQueue` to create queue_
// `InitOnce` can only be called once; otherwise an exception is raised
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
PADDLE_ENFORCE(queue_ == nullptr);
queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
}
const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```
There are some major points to keep in mind:
- `LoDTensorBlockingQueueHolder` should be a `Variable` in the global scope, so that `reader_op` can find it when reading data.
- A `Variable` holding a `LoDTensorBlockingQueueHolder` (not a `VarDesc`) must be created in Python code before `Executor::Run()`, so that `Executor::Run()` can get the feeding data when it is called.
- The reader-creating op (`CreatePyReaderOp`, described below) should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
## Release of the GIL in pybind
`pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` is invoked from the Python side, so that `LoDTensorBlockingQueue::Push()` and `Executor::Run()` can run in parallel.
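As a rough illustration of how this is typically wired up (a sketch only; the binding name `push` and the surrounding module code are assumptions, not the actual Paddle pybind source):
```C++
// Inside a PYBIND11_MODULE(...) body (sketch). The GIL is released before the
// blocking Push() so that other Python threads, e.g. the one that called
// Executor::Run(), can keep running while this thread waits on the queue.
namespace py = pybind11;
py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
    m, "LoDTensorBlockingQueue")
    .def("push",
         [](LoDTensorBlockingQueue &self,
            const std::vector<framework::LoDTensor> &lod_tensor_vec) {
           py::gil_scoped_release release;  // drop the GIL before blocking
           return self.Push(lod_tensor_vec);
         });
```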
## Design of PyReader
`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
```C++
class PyReader : public ReaderBase {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
void ReInit() override { return; }
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
```
## Design of CreatePyReaderOp
`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
```C++
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) return;
const std::string& queue_name = Input("blocking_queue");
auto* queue_holder_var = scope.FindVar(queue_name);
PADDLE_ENFORCE(queue_holder_var != nullptr);
auto* queue_holder = queue_holder_var
->template GetMutable<framework::LoDTensorBlockingQueueHolder>();
out->Reset(new PyReader(queue_holder->GetQueue()));
}
};
```
## Design of Python codes
The design of the Python code is as follows. First, we construct a variable holding a `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a `CreatePyReaderOp` layer is constructed which accepts the name of the `LoDTensorBlockingQueueHolder` variable. Both the `LoDTensorBlockingQueue` object and the output of the layer are returned.
```Python
def py_reader(capacity, shapes):
    queue_name = unique_name.generate("lod_tensor_blocking_queue")
    var = global_scope().var(queue_name)  # create the LoDTensorBlockingQueueHolder Variable
    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)  # init the queue
    out = create_var()
    create_py_reader_op_with_queue_name(
        inputs={'blocking_queue': queue_name},
        outputs={'Out': [out]})
    return out, feed_queue
```
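A hedged usage sketch of this design is given below; the Python-side `push`/`close` methods on `feed_queue` and the batch helper are assumptions about the eventual binding rather than the final API:
```Python
out, feed_queue = py_reader(capacity=64, shapes=[[-1, 784], [-1, 1]])

def feeding_thread():
    # Runs concurrently with Executor::Run(), which pops from the same queue
    # through the reader_op created by py_reader().
    for lod_tensor_vec in make_lod_tensor_batches():  # user-defined helper (placeholder)
        feed_queue.push(lod_tensor_vec)               # blocks when the queue is full
    feed_queue.close()                                # signal the C++ reader that feeding is done
```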
...@@ -74,10 +74,10 @@ void OperatorWithKernel::Run( ...@@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
auto kernel_type_for_var = this->GetKernelTypeForVar(...); auto kernel_type_for_var = this->GetKernelTypeForVar(...);
if (kernel_type_for_var.place_ != expected_kernel_key.place_) { if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
auto* trans_var = new_scope.Var(var_name); auto* trans_var = new_scope.Var(var_name);
auto* out = DataTransform(expected_kernel_key, auto* out = TransformData(expected_kernel_key,
kernel_type_for_var, kernel_type_for_var,
*tensor_in); *tensor_in);
CopyVariableWithTensor(...); SetTensorToVariable(...);
} }
} }
......
## Heap Memory Profiling and Optimization # Heap Memory Profiling and Optimization
Any computer program can suffer from memory leaks. A **memory leak** usually means the program allocates memory on the heap without releasing it; as the program runs, its memory footprint keeps growing, which hurts the program's stability (it may get slower and slower, or hit an OOM) and can even destabilize and bring down the machine it runs on. Any computer program can suffer from memory leaks. A **memory leak** usually means the program allocates memory on the heap without releasing it; as the program runs, its memory footprint keeps growing, which hurts the program's stability (it may get slower and slower, or hit an OOM) and can even destabilize and bring down the machine it runs on.
...@@ -20,11 +20,11 @@ Paddle also provides a gperftools-based [CPU profiling tutorial](https://github.com/P ...@@ -20,11 +20,11 @@ Paddle also provides a gperftools-based [CPU profiling tutorial](https://github.com/P
For heap memory analysis, we mainly rely on thread-caching malloc and heap profiling with tcmalloc. For heap memory analysis, we mainly rely on thread-caching malloc and heap profiling with tcmalloc.
## Usage Workflow ## Environment
#### Environment
This tutorial uses the Docker development environment paddlepaddle/paddle:latest-dev provided by Paddle, based on Ubuntu 16.04.4 LTS. This tutorial uses the Docker development environment paddlepaddle/paddle:latest-dev provided by Paddle, based on Ubuntu 16.04.4 LTS.
#### Usage Workflow ## Usage Workflow
- Install google-perftools - Install google-perftools
......
# How to Use the timeline Tool for Performance Profiling
1. Add `with profiler.profiler(...)` around the main training loop. After running, the code will generate a profile record file at `/tmp/profile`.
**Tip:**
Please do not run too many iterations while the timeline is being recorded, because the number of records in the timeline is proportional to the number of iterations.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default this program generates a `/tmp/timeline` file; you can also change this path with command-line arguments, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
1. Open the Chrome browser, go to <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The result looks like the figure below; you can zoom in to inspect the details of the timeline.
![chrome timeline](./timeline.jpeg)
...@@ -19,6 +19,9 @@ endif(APPLE) ...@@ -19,6 +19,9 @@ endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api) set(inference_deps paddle_inference_api paddle_fluid_api)
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
endif()
function(inference_api_test TARGET_NAME) function(inference_api_test TARGET_NAME)
if (WITH_TESTING) if (WITH_TESTING)
...@@ -43,6 +46,10 @@ cc_library(paddle_inference_api ...@@ -43,6 +46,10 @@ cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_library(paddle_inference_api_shared SHARED
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS test_paddle_inference_api.cc SRCS test_paddle_inference_api.cc
DEPS paddle_inference_api) DEPS paddle_inference_api)
...@@ -50,17 +57,30 @@ cc_test(test_paddle_inference_api ...@@ -50,17 +57,30 @@ cc_test(test_paddle_inference_api
inference_api_test(test_paddle_inference_api_impl inference_api_test(test_paddle_inference_api_impl
ARGS test_word2vec test_image_classification) ARGS test_word2vec test_image_classification)
if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so. # compile the libinference_anakin_api.a and compile with anakin.so.
nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common) target_link_libraries(inference_anakin_api anakin anakin_saber_common)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api) DEPS inference_anakin_api)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
......
...@@ -14,3 +14,48 @@ ...@@ -14,3 +14,48 @@
# #
inference_api_test(simple_on_word2vec ARGS test_word2vec) inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
endif()
set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
function(inference_download_test_demo TARGET)
if (NOT WITH_TESTING)
return()
endif()
set(options "")
set(oneValueArgs URL)
set(multiValueArgs SRCS)
cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
message(STATUS "inference demo ${test_dir}")
if(NOT EXISTS "${test_dir}")
message(STATUS "Download ${TARGET} model from ${tests_URL}")
execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
endif()
cc_test(${TARGET} SRCS "${tests_SRCS}"
DEPS paddle_inference_api paddle_fluid
ARGS --data=${test_dir}/data.txt
--modeldir=${test_dir}/model
--refer=${test_dir}/result.txt)
endfunction()
# disable mobilenet test
#inference_download_test_demo(mobilenet_inference_demo
# SRCS vis_demo.cc
# URL ${URL_ROOT}mobilenet.tar.gz)
inference_download_test_demo(se_resnext50_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}se_resnext50.tar.gz)
inference_download_test_demo(ocr_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}ocr.tar.gz)
# Inference Demos
Input data format:
- Each line contains a single record
- Each record's format is
```
<space-separated floats as data>\t<space-separated ints as shape>
```
Follow the C++ code in `vis_demo.cc`.
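For example, a hypothetical record carrying six floats with shape `1 2 3` would be a single line of the form (`\t` stands for a literal tab character):
```
0.1 0.2 0.3 0.4 0.5 0.6\t1 2 3
```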
## MobileNet
To execute the demo, simply run
```sh
./mobilenet_inference_demo --modeldir <model> --data <datafile>
```
## SE-ResNeXt-50
To execute the demo, simply run
```sh
./se_resnext50_inference_demo --modeldir <model> --data <datafile>
```
## OCR
To execute the demo, simply run
```sh
./ocr_inference_demo --modeldir <model> --data <datafile>
```
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <thread> #include <thread>
#include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle { namespace paddle {
namespace demo { namespace demo {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
namespace demo {
static void split(const std::string& str,
char sep,
std::vector<std::string>* pieces) {
pieces->clear();
if (str.empty()) {
return;
}
size_t pos = 0;
size_t next = str.find(sep, pos);
while (next != std::string::npos) {
pieces->push_back(str.substr(pos, next - pos));
pos = next + 1;
next = str.find(sep, pos);
}
if (!str.substr(pos).empty()) {
pieces->push_back(str.substr(pos));
}
}
/*
* Get a summary of a PaddleTensor content.
*/
static std::string SummaryTensor(const PaddleTensor& tensor) {
std::stringstream ss;
int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);
ss << "data[:10]\t";
switch (tensor.dtype) {
case PaddleDType::INT64: {
for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
}
break;
}
case PaddleDType::FLOAT32:
for (int i = 0; i < std::min(num_elems, 10); i++) {
ss << static_cast<float*>(tensor.data.data())[i] << " ";
}
break;
}
return ss.str();
}
} // namespace demo
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains demo for mobilenet, se-resnext50 and ocr.
*/
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/contrib/inference/demo/utils.h"
#include "paddle/contrib/inference/paddle_inference_api.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
#endif
namespace paddle {
namespace demo {
DEFINE_string(modeldir, "", "Directory of the inference model.");
DEFINE_string(refer, "", "path to reference result for comparison.");
DEFINE_string(
data,
"",
"path of data; each line is a record, format is "
"'<space splitted floats as data>\t<space splitted ints as shape'");
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
void split(const std::string& str, char sep, std::vector<std::string>* pieces);
Record ProcessALine(const std::string& line) {
LOG(INFO) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
<< "data format error, should be <data>\t<shape>";
Record record;
std::vector<std::string> data_strs;
split(columns[0], ' ', &data_strs);
for (auto& d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(columns[1], ' ', &shape_strs);
for (auto& s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
LOG(INFO) << "data size " << record.data.size();
LOG(INFO) << "data shape size " << record.shape.size();
return record;
}
void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
std::string line;
std::ifstream file(referfile);
std::getline(file, line);
auto refer = ProcessALine(line);
file.close();
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
LOG(INFO) << "predictor output numel " << numel;
LOG(INFO) << "reference output numel " << refer.data.size();
EXPECT_EQ(numel, refer.data.size());
switch (output.dtype) {
case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) {
EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
}
break;
}
case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) {
EXPECT_NEAR(
static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
}
break;
}
}
/*
* Use the native fluid engine to inference the demo.
*/
void Main(bool use_gpu) {
NativeConfig config;
config.param_file = FLAGS_modeldir + "/__params__";
config.prog_file = FLAGS_modeldir + "/__model__";
config.use_gpu = use_gpu;
config.device = 0;
#ifdef PADDLE_WITH_CUDA
config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
#endif
LOG(INFO) << "init predictor";
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
LOG(INFO) << "begin to process data";
// Just a single batch of data.
std::string line;
std::ifstream file(FLAGS_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input{
.name = "xx",
.shape = record.shape,
.data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
.dtype = PaddleDType::FLOAT32};
LOG(INFO) << "run executor";
std::vector<PaddleTensor> output;
predictor->Run({input}, &output);
LOG(INFO) << "output.size " << output.size();
auto& tensor = output.front();
LOG(INFO) << "output: " << SummaryTensor(tensor);
// compare with reference result
CheckOutput(FLAGS_refer, tensor);
}
TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
#ifdef PADDLE_WITH_CUDA
TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
#endif
} // namespace demo
} // namespace paddle
# Inference High-level APIs # Inference High-level APIs
This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application. This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly.
The APIs are described in `paddle_inference_api.h`, just one header file, and two libraries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed. The APIs are described in `paddle_inference_api.h`, just one header file, and two libraries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment.
## PaddleTensor ## PaddleTensor
We provide the `PaddleTensor` data structure is to give a general tensor interface. We provide the `PaddleTensor` data structure to give a general tensor interface.
The definition is The definition is
...@@ -17,18 +17,19 @@ struct PaddleTensor { ...@@ -17,18 +17,19 @@ struct PaddleTensor {
}; };
``` ```
The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`. The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type.
The `name` field is used to specify the name of input variable, The `name` field is used to specify the name of an input variable,
that is important when there are multiple inputs and need to distiuish which variable to set. that is important when there are multiple inputs and need to distinguish which variable to set.
## engine ## engine
The inference APIs have two different underlying implementations; currently there are two valid engines: The inference APIs have two different underlying engines
- the native engine, which consists of the native operators and framework, - the native engine, which consists of the native operators and framework,
- the Anakin engine, which is a Anakin library embeded. - the Anakin engine, which has an Anakin library embedded.
The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, The native engine takes a native Paddle model as input, and supports any model that trained by Paddle,
but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported. the Anakin engine is faster for some model,
but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported.
```c++ ```c++
enum class PaddleEngineKind { enum class PaddleEngineKind {
...@@ -38,10 +39,10 @@ enum class PaddleEngineKind { ...@@ -38,10 +39,10 @@ enum class PaddleEngineKind {
``` ```
## PaddlePredictor and how to create one ## PaddlePredictor and how to create one
The main interface is `PaddlePredictor`, there are following methods The main interface is `PaddlePredictor`, with the following methods:
- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)` - `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
- take inputs and output `output_data` - takes the inputs and fills `output_data`.
- `Clone` to clone a predictor from an existing one, with the model parameters shared. - `Clone` to clone a predictor from an existing one, with the model parameters shared.
There is a factory method to help create a predictor, and the user takes ownership of this object. There is a factory method to help create a predictor, and the user takes ownership of this object.
...@@ -51,9 +52,9 @@ template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative> ...@@ -51,9 +52,9 @@ template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
``` ```
By specifying the engine kind and config, one can get an specific implementation. By specifying the engine kind and config, one can get a specific implementation.
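For example, a minimal sketch (added for illustration; the model directory is a placeholder) that creates a native-engine predictor and runs it on the tensor built in the `PaddleTensor` section above:
```c++
paddle::NativeConfig config;
config.model_dir = "xxx";  // placeholder: path to a saved inference model
config.use_gpu = false;

// Create a predictor backed by the native engine.
auto predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                  paddle::PaddleEngineKind::kNative>(config);

// Feed the inputs and collect the outputs.
std::vector<paddle::PaddleTensor> outputs;
CHECK(predictor->Run({tensor}, &outputs));
```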
## Reference ## Reference
- [paddle_inference_api.h](./paddle_inference_api.h) - [paddle_inference_api.h](./paddle_inference_api.h)
- [demos](./demo) - [some demos](./demo)
# Paddle Inference API
To make inference deployment simpler and more convenient, Fluid provides a set of high-level APIs that hide the different underlying optimized implementations.
The inference library contains:
- the header file `paddle_inference_api.h`, which defines all the interfaces
- the library file `libpaddle_fluid.so` or `libpaddle_fluid.a`
- the library file `libpaddle_inference_api.so` or `libpaddle_inference_api.a`
Below is a detailed introduction to the API concepts.
## PaddleTensor
PaddleTensor defines the most basic data format for inference input and output; its definition is
```c++
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
```
- `name` is used to specify the name of the variable in the model that the input data corresponds to (not used for now, but it will be enabled when arbitrary targets are supported later)
- `shape` represents the shape of a Tensor
- `data` is stored as continuous memory in a `PaddleBuf`; a `PaddleBuf` can either wrap external data or `malloc` its own memory, see the related definitions in the header file for details
- `dtype` represents the data type of the Tensor
## engine
The high-level API has several optimized implementations underneath, which we call engines; currently there are three engines:
- the native engine, composed of Paddle's native forward operators, which naturally supports all models trained by Paddle,
- the Anakin engine, which wraps [Anakin](https://github.com/PaddlePaddle/Anakin); it performs well on some models, but it only accepts its own model format and cannot support all Paddle models,
- the TensorRT mixed engine, which supports [TensorRT](https://developer.nvidia.com/tensorrt) via sub-graphs; it supports all Paddle models and automatically cuts out parts of the computation sub-graph to be accelerated on TensorRT (WIP, see the sketch after the code below)
The corresponding enum is
```c++
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
};
```
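As a rough sketch (added for illustration, based on the TensorRT sub-graph test added in this change; the model directory is a placeholder and this engine is still WIP), the TensorRT mixed engine is selected through the same factory, only with a different config type and engine kind:
```c++
paddle::TensorRTConfig config;
config.model_dir = "xxx";  // placeholder: path to a saved inference model
config.use_gpu = true;     // the TensorRT mixed engine runs on GPU
config.fraction_of_gpu_memory = 0.15;
config.device = 0;

auto predictor =
    paddle::CreatePaddlePredictor<paddle::TensorRTConfig,
                                  paddle::PaddleEngineKind::kAutoMixedTensorRT>(config);
```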
## Inference deployment process
Overall there are the following steps:
1. Create a `PaddlePredictor` with a proper config
2. Create the input `PaddleTensor`s and pass them to the `PaddlePredictor`
3. Get the output `PaddleTensor`s and fetch the results
A simple model is demonstrated in full below, with some detail code omitted.
```c++
#include "paddle_inference_api.h"
// create a config and modify the related settings
paddle::NativeConfig config;
config.model_dir = "xxx";
config.use_gpu = false;
// create a native PaddlePredictor
auto predictor =
paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
// create the input tensor
int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// create the output tensors; the memory of the output tensors can be reused
std::vector<paddle::PaddleTensor> outputs;
// run the prediction
CHECK(predictor->Run({tensor}, &outputs));
// fetch the outputs ...
```
At compile time, just link against `libpaddle_fluid.a/.so` and `libpaddle_inference_api.a/.so`.
## Detailed code references
- [inference demos](./demo)
- [more complex single-thread/multi-thread examples](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
...@@ -16,6 +16,19 @@ limitations under the License. */ ...@@ -16,6 +16,19 @@ limitations under the License. */
namespace paddle { namespace paddle {
int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::INT64:
return sizeof(int64_t);
default:
// Unsupported data type.
assert(false);
return -1;
}
}
PaddleBuf::PaddleBuf(PaddleBuf&& other) PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_), : data_(other.data_),
length_(other.length_), length_(other.length_),
...@@ -62,4 +75,4 @@ void PaddleBuf::Free() { ...@@ -62,4 +75,4 @@ void PaddleBuf::Free() {
} }
} }
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
/* /*
* This file contains the definition of a simple Inference API for Paddle. * This file contains the definition of a simple Inference API for Paddle.
* *
* ATTENTION: It requires some C++ features, for lower version C++ or C, we * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API. * might release another API.
*/ */
...@@ -73,12 +73,12 @@ struct PaddleTensor { ...@@ -73,12 +73,12 @@ struct PaddleTensor {
}; };
enum class PaddleEngineKind { enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility. kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference. kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
// TODO(Superjomn) support the following engines later. // TODO(Superjomn) support the following engines later.
// kTensorRT, // Use TensorRT for inference. // kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
}; };
/* /*
...@@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config { ...@@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config {
int max_batch_size{-1}; int max_batch_size{-1};
}; };
struct TensorRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1};
};
// A factory to help create different predictors. // A factory to help create different predictors.
// //
// FOR EXTENSION DEVELOPER: // FOR EXTENSION DEVELOPER:
...@@ -140,4 +145,7 @@ struct AnakinConfig : public PaddlePredictor::Config { ...@@ -140,4 +145,7 @@ struct AnakinConfig : public PaddlePredictor::Config {
// Similarly, each engine kind should map to a unique predictor implementation. // Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative> template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
} // namespace paddle } // namespace paddle
...@@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init( ...@@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init(
LOG(ERROR) << "fail to load inference model."; LOG(ERROR) << "fail to load inference model.";
return false; return false;
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
executor_->CreateVariables( executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
...@@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
for (size_t i = 0; i < feed_target_names_.size(); ++i) { for (size_t i = 0; i < feed_target_names_.size(); ++i) {
VLOG(4) << "setting " << i << "-th target";
feed_targets[feed_target_names_[i]] = &feeds[i]; feed_targets[feed_target_names_[i]] = &feeds[i];
} }
// get fetch variable // get fetch variable
...@@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
} }
// Run the inference program // Run the inference program
// if share variables, we need not create variables // if share variables, we need not create variables
VLOG(4) << "Run prepared context";
executor_->RunPreparedContext( executor_->RunPreparedContext(
ctx_.get(), ctx_.get(),
sub_scope_ != nullptr ? sub_scope_ : scope_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
&feed_targets, &feed_targets,
&fetch_targets, &fetch_targets,
false /* don't create variable each time */); false /* don't create variable each time */);
VLOG(4) << "Finish prepared context";
if (!GetFetch(fetchs, output_data)) { if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs"; LOG(ERROR) << "fail to get fetches";
return false; return false;
} }
VLOG(3) << "predict cost: " << timer.toc() << "ms"; VLOG(3) << "predict cost: " << timer.toc() << "ms";
......
...@@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor { ...@@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor {
~NativePaddlePredictor() override; ~NativePaddlePredictor() override;
private: protected:
bool SetFeed(const std::vector<PaddleTensor> &input_datas, bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds); std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs, bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/contrib/inference/paddle_inference_api.h"
#include "paddle/contrib/inference/paddle_inference_api_impl.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
using inference::analysis::Argument;
using inference::Singleton;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
class TensorRTSubgraphPredictor : public NativePaddlePredictor {
public:
explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
}
executor_.reset(new paddle::framework::Executor(place_));
// Initialize the inference program
if (!config_.model_dir.empty()) {
// Parameters are saved in separate files sited in
// the specified `dirname`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.model_dir);
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
return false;
}
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
*inference_program_->Proto() = *argument.transformed_program_desc;
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
executor_->CreateVariables(
*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
private:
TensorRTConfig config_;
};
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const TensorRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) {
// 1. GPU memory
PADDLE_ENFORCE_GT(
config.fraction_of_gpu_memory,
0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f &&
config.fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory);
flags.push_back(flag);
VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags);
}
}
std::unique_ptr<PaddlePredictor> predictor(
new TensorRTSubgraphPredictor(config));
if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
->Init(nullptr)) {
return nullptr;
}
return std::move(predictor);
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/contrib/inference/paddle_inference_api.h"
namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
//# 1. Create PaddlePredictor with a config.
TensorRTConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
CreatePaddlePredictor<TensorRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
for (int batch_id = 0; batch_id < 3; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
}
}
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
} // namespace paddle
\ No newline at end of file
...@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: ", in.type().name()); "Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type; memory::data_type out_type = in_type;
memory::format in_format = auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
in_tz.size() == 2 ? memory::format::nc : in.format(); auto out_format =
memory::format out_format = platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
......
...@@ -61,6 +61,7 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { ...@@ -61,6 +61,7 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
if (iter != dict.end()) return iter->second; if (iter != dict.end()) return iter->second;
return MKLDNNDataType::data_undef; return MKLDNNDataType::data_undef;
} }
#endif #endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
...@@ -18,17 +18,21 @@ limitations under the License. */ ...@@ -18,17 +18,21 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/data_type_transform.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static void PassTensorData(Tensor* from, Tensor* to) { static void PassTensorData(Tensor *from, Tensor *to) {
to->ShareDataWith(*from); to->ShareDataWith(*from);
*from = Tensor(); *from = Tensor();
} }
void DataTransform(const OpKernelType& expected_kernel_type, void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType& kernel_type_for_var, const OpKernelType &kernel_type_for_var,
const Tensor& input_tensor, Tensor* output_tensor) { const Tensor &input_tensor, Tensor *output_tensor) {
bool transformed = false; bool transformed = false;
Tensor in; Tensor in;
in.ShareDataWith(input_tensor); in.ShareDataWith(input_tensor);
...@@ -47,9 +51,13 @@ void DataTransform(const OpKernelType& expected_kernel_type, ...@@ -47,9 +51,13 @@ void DataTransform(const OpKernelType& expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur // Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor); out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN); out.set_layout(DataLayout::kMKLDNN);
out.set_format(ToMKLDNNFormat(lin)); out.set_format(out_format);
#endif #endif
} else { } else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
...@@ -85,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, ...@@ -85,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type,
output_tensor->ShareDataWith(in); output_tensor->ShareDataWith(in);
} }
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable* out_var) { Variable *out_var) {
if (in_var.IsType<LoDTensor>()) { if (in_var.IsType<LoDTensor>()) {
auto& in_lod_tensor = in_var.Get<LoDTensor>(); auto &in_lod_tensor = in_var.Get<LoDTensor>();
auto* tran_lod_tensor = out_var->GetMutable<LoDTensor>(); auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_lod(in_lod_tensor.lod());
tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->set_layout(in_lod_tensor.layout());
tran_lod_tensor->ShareDataWith(tensor); tran_lod_tensor->ShareDataWith(tensor);
} else if (in_var.IsType<SelectedRows>()) { } else if (in_var.IsType<SelectedRows>()) {
auto& in_selected_rows = in_var.Get<SelectedRows>(); auto &in_selected_rows = in_var.Get<SelectedRows>();
auto* trans_selected_rows = out_var->GetMutable<SelectedRows>(); auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_height(in_selected_rows.height());
trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->set_rows(in_selected_rows.rows());
trans_selected_rows->mutable_value()->ShareDataWith(tensor); trans_selected_rows->mutable_value()->ShareDataWith(tensor);
......
...@@ -30,12 +30,15 @@ limitations under the License. */ ...@@ -30,12 +30,15 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void DataTransform(const OpKernelType& expected_kernel_type, void TransformData(const OpKernelType &expected_kernel_type,
const OpKernelType& kernel_type_for_var, const OpKernelType &kernel_type_for_var,
const Tensor& input_tensor, Tensor* out); const Tensor &input_tensor, Tensor *out);
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, /**
Variable* out_var); * Set OutVar from InVar, except the tensor is shared with `tensor`
*/
void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
Variable *out_var);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -207,53 +207,56 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -207,53 +207,56 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
is_forwarding = false; is_forwarding = false;
} else { } else {
int op_dev_id = GetOpDeviceID(*op); int op_dev_id = GetOpDeviceID(*op);
if (op_dev_id == -1) { // var on all device if (op_dev_id != -1) { // This op only runs on one specific device.
CreateComputationalOps(&result, *op, places_.size());
} else {
CreateComputationalOp(&result, *op, op_dev_id); CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) { for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices_.emplace(var_name, op_dev_id); var_name_on_devices_.emplace(var_name, op_dev_id);
} }
} } else {
if (!is_forwarding && places_.size() > 1) { // This op runs on all devices, and its output may have parameter's
// Currently, we assume that once gradient is generated, it can be // gradients.
// broadcast, and each gradient is only broadcast once. CreateComputationalOps(&result, *op, places_.size());
if (static_cast<bool>(boost::get<int>(op->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) & if (!is_forwarding && places_.size() > 1) {
static_cast<int>(OpRole::kBackward))) { // Currently, we assume that once gradient is generated, it can be
try { // broadcast, and each gradient is only broadcast once.
auto backward_vars = if (static_cast<bool>(boost::get<int>(op->GetAttr(
boost::get<std::vector<std::string>>(op->GetNullableAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) &
OpProtoAndCheckerMaker::OpRoleVarAttrName())); static_cast<int>(OpRole::kBackward))) {
try {
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); auto backward_vars =
boost::get<std::vector<std::string>>(op->GetNullableAttr(
for (size_t i = 0; i < backward_vars.size(); i += 2) { OpProtoAndCheckerMaker::OpRoleVarAttrName()));
auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1]; PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
for (size_t i = 0; i < backward_vars.size(); i += 2) {
switch (strategy_.reduce_) { auto &p_name = backward_vars[i];
case BuildStrategy::ReduceStrategy::kReduce: auto &g_name = backward_vars[i + 1];
cur_device_id = GetAppropriateDeviceID({g_name}); VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices_.emplace(g_name, cur_device_id); switch (strategy_.reduce_) {
bcast_var_name_set[cur_device_id].emplace(p_name); case BuildStrategy::ReduceStrategy::kReduce:
break; cur_device_id = GetAppropriateDeviceID({g_name});
case BuildStrategy::ReduceStrategy::kAllReduce: CreateReduceOp(&result, g_name, cur_device_id);
if (IsSparseGradient(g_name)) { var_name_on_devices_.emplace(g_name, cur_device_id);
CreateReduceOp(&result, g_name, 0); bcast_var_name_set[cur_device_id].emplace(p_name);
CreateBroadcastOp(&result, g_name, 0); break;
} else { case BuildStrategy::ReduceStrategy::kAllReduce:
InsertAllReduceOp(&result, g_name); if (IsSparseGradient(g_name)) {
} CreateReduceOp(&result, g_name, 0);
break; CreateBroadcastOp(&result, g_name, 0);
default: } else {
LOG(FATAL) << "Unknown reduce strategy "; InsertAllReduceOp(&result, g_name);
break; }
break;
default:
LOG(FATAL) << "Unknown reduce strategy ";
break;
}
} }
} catch (boost::bad_get e) {
} }
} catch (boost::bad_get e) {
} }
} }
} }
...@@ -470,7 +473,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, ...@@ -470,7 +473,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const { const OpDesc &op) const {
int op_dev_id = -1; int op_dev_id = -1;
if (op.Type() == "split_byref") { if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
...@@ -483,6 +486,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, ...@@ -483,6 +486,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
} }
} else if (op.Type() == "concat") { } else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
for (auto &varname : op.OutputArgumentNames()) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE(
"the distribute training related op should be in [split_byref, " "the distribute training related op should be in [split_byref, "
......
...@@ -30,7 +30,7 @@ class SSAGraphBuilder { ...@@ -30,7 +30,7 @@ class SSAGraphBuilder {
SSAGraphBuilder() {} SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0; virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const { return -1; } virtual int GetVarDeviceID(const std::string &var_name) const = 0;
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <string>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
...@@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { ...@@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
return graph; return graph;
} }
int GetVarDeviceID(const std::string& var_name) const override {
return builder_->GetVarDeviceID(var_name);
}
bool IsValidGraph(const SSAGraph* graph) const; bool IsValidGraph(const SSAGraph* graph) const;
private: private:
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <iosfwd> #include <iosfwd>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle { namespace paddle {
...@@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { ...@@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
return graph; return graph;
} }
int GetVarDeviceID(const std::string& var_name) const override {
return builder_->GetVarDeviceID(var_name);
}
private: private:
std::unique_ptr<SSAGraphPrinter> printer_; std::unique_ptr<SSAGraphPrinter> printer_;
std::unique_ptr<SSAGraphBuilder> builder_; std::unique_ptr<SSAGraphBuilder> builder_;
......
...@@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { ...@@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
// only print first ten elements // only print first ten elements
int64_t size = t.numel() < 10 ? t.numel() : 10; int64_t size = t.numel() < 10 ? t.numel() : 10;
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
if (t.type().hash_code() == typeid(float).hash_code()) { if (t.type().hash_code() == typeid(float).hash_code()) { // NOLINT
os << t.data<float>()[i] << " "; os << t.data<float>()[i] << " ";
} else if (t.type().hash_code() == typeid(int64_t).hash_code()) { } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
os << t.data<int64_t>()[i] << " "; os << t.data<int64_t>()[i] << " ";
......
...@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { ...@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
return ret; return ret;
} }
inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
return (!platform::places_are_same_class(l.place_, r.place_)) || return (!platform::places_are_same_class(l.place_, r.place_)) ||
(l.data_type_ != r.data_type_) || (l.data_type_ != r.data_type_) ||
NeedTransformLayout(l.data_layout_, r.data_layout_); NeedTransformLayout(l.data_layout_, r.data_layout_);
......
...@@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
"There are no kernels which are registered in the %s operator.", type_); "There are no kernels which are registered in the %s operator.", type_);
} }
ExecutionContext ctx(*this, scope, *dev_ctx);
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
...@@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// Do selection // Do selection
// } // }
auto expected_kernel_key = this->GetExpectedKernelType(ctx); auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
...@@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString(expected_kernel_key)); KernelTypeToString(expected_kernel_key));
} }
// do data transform // do data transformScope &transfer_scope;
Scope& new_scope = scope.NewScope(); std::vector<std::string> transfered_inplace_vars;
auto* transfer_scope =
TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
std::vector<std::string> inplace_vars; // exec scope is the scope that kernel actually executed on.
for (auto& var_name_item : this->Inputs()) { const Scope& exec_scope =
for (auto& var_name : var_name_item.second) { (transfer_scope == nullptr ? scope : *transfer_scope);
auto* var = scope.FindVar(var_name);
if (var && VarIsTensor(var)) { if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
auto* tensor_in = GetTensorFromVar(var); dev_ctx = pool.Get(expected_kernel_key.place_);
if (tensor_in->IsInitialized()) {
auto kernel_type_for_var = this->GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(),
var_name) != out_var_names.end()) {
inplace_vars.push_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
auto* trans_var = new_scope.Var(var_name);
std::shared_ptr<Tensor> out(new Tensor);
DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
out.get());
CopyVariableWithTensor(*var, *(out.get()), trans_var);
}
}
}
}
} }
auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx));
kernel_iter->second->Compute(
ExecutionContext(*this, new_scope, *new_dev_ctx));
for (auto& var_name : inplace_vars) { if (!transfered_inplace_vars.empty()) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; // there is inplace variable has been transfered.
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
} }
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); dev_ctx->Wait();
} }
if (FLAGS_check_nan_inf) { if (FLAGS_check_nan_inf) {
for (auto& vname : OutputVars(true)) { for (auto& vname : OutputVars(true)) {
auto* var = new_scope.FindVar(vname); auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue; if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>()); CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
...@@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
} }
} }
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
auto* transformed_tensor =
GetTensorFromVar(transfer_scope.FindVar(var_name));
original_tensor->ShareDataWith(*transformed_tensor);
}
}
Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const {
Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope.FindVar(var_name);
// Only tensors can be transferred to another device.
if (var == nullptr || !VarIsTensor(var)) {
continue;
}
auto* tensor_in = GetTensorFromVar(var);
if (!tensor_in->IsInitialized()) {
continue;
}
auto kernel_type_for_var = GetKernelTypeForVar(
var_name_item.first, *tensor_in, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
}
auto out_var_names = OutputVars(true);
if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
out_var_names.end()) {
transfered_inplace_vars->emplace_back(var_name);
}
VLOG(3) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
if (new_scope == nullptr) {
new_scope = &scope.NewScope();
}
auto* trans_var = new_scope->Var(var_name);
Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var);
}
}
return new_scope;
}
proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const { const ExecutionContext& ctx) const {
......
...@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase { ...@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase {
// same. // same.
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place) const final;
/**
* Transfer data from scope to a transferred scope. If there is no data that needs
* to be transferred, it returns nullptr.
*
* * transfered_inplace_vars is an output vector.
*/
Scope* TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const;
void TransferInplaceVarsBack(const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& exec_scope) const;
}; };
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor( ...@@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor(
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const std::unordered_set<std::string> &vars) const { const std::unordered_set<std::string> &vars) const {
// the the initialize bcast, all vars would be bcast from device(0), otherwise // during the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device. // bcast from the specified device.
bool initialize = builder_.get() == nullptr ? true : false; bool initializing = builder_.get() == nullptr ? true : false;
for (auto &var : vars) { for (auto &var : vars) {
int var_dev_id = int var_dev_id =
builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
if (!initialize && var_dev_id == -1) continue; if (!initializing && var_dev_id == -1) continue;
framework::Variable *main_var = nullptr; framework::Variable *main_var = nullptr;
if (initialize) { if (initializing) {
main_var = member_->local_scopes_[0]->FindVar(var); main_var = member_->local_scopes_[0]->FindVar(var);
} else { } else {
main_var = member_->local_scopes_[var_dev_id]->FindVar(var); main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
...@@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs(
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
if ((initialize && i == 0) || (!initialize && i == var_dev_id)) { if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id)) {
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
...@@ -181,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -181,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs(
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, if (initializing) {
nccl_ctx.comm_, nccl_ctx.stream()); platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
} else {
if (var_dev_id >= 0) {
platform::dynload::ncclBcast(buffers[i], numel, data_type,
var_dev_id, nccl_ctx.comm_,
nccl_ctx.stream());
}
}
} }
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
} }
......
...@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
} }
#endif #endif
} }
...@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) { Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx; const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) { if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(src.place());
} else {
dev_ctx = pool.Get(dst_place); dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
} }
TensorCopy(src, dst_place, *dev_ctx, dst); TensorCopy(src, dst_place, *dev_ctx, dst);
} }
......
...@@ -23,10 +23,25 @@ limitations under the License. */ ...@@ -23,10 +23,25 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
// and dst_place are two different GPUs, to ensure that the operation can
// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
// If ctx_place and src_place are the same, src_ctx.Wait() is added
// after memory::Copy; if ctx_place and dst_place are the same,
// src_ctx.Wait() is added before memory::Copy.
void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst); const platform::DeviceContext& ctx, Tensor* dst);
// NOTE(zcd): If src.place() and dst_place are two different GPUs,
// the copy operation is carried out on dst_place's stream. This is
// very important, because TensorCopy is an async operator, and in most
// cases, once this copy operator returns, dst is to be used in dst_place's
// stream. If this copy operation were carried out on src_place's stream,
// the copy might not yet be completed when dst is used in dst_place's
// stream.
void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst); Tensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place, void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst); Tensor* dst);
......
...@@ -28,9 +28,10 @@ endif() ...@@ -28,9 +28,10 @@ endif()
if(WITH_TESTING) if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book) add_subdirectory(tests/book)
add_subdirectory(analysis)
endif() endif()
add_subdirectory(analysis)
if (TENSORRT_FOUND) if (TENSORRT_FOUND)
add_subdirectory(tensorrt) add_subdirectory(tensorrt)
endif() endif()
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
fluid_to_data_flow_graph_pass.cc fluid_to_data_flow_graph_pass.cc
data_flow_graph_to_fluid_pass.cc data_flow_graph_to_fluid_pass.cc
tensorrt_subgraph_pass.cc
dfg_graphviz_draw_pass.cc dfg_graphviz_draw_pass.cc
DEPS framework_proto) tensorrt_subgraph_pass.cc
tensorrt_subgraph_node_mark_pass.cc
analyzer.cc
helper.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
function (inference_analysis_test TARGET) function (inference_analysis_test TARGET)
set(options "") if(WITH_TESTING)
set(oneValueArgs "") set(options "")
set(multiValueArgs SRCS) set(oneValueArgs "")
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(multiValueArgs SRCS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_test(${TARGET} cc_test(${TARGET}
SRCS "${analysis_test_SRCS}" SRCS "${analysis_test_SRCS}"
DEPS analysis DEPS analysis
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
...@@ -28,5 +32,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_ ...@@ -28,5 +32,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
# Inference Analysis
The `inference/analysis` module is used to analyze and optimize the inference program.
It borrows some philosophy from `LLVM/analysis`,
and makes the various optimization features pluggable so that they can co-exist in a pipeline.
We borrowed some concepts from LLVM, such as
- [Pass](./pass.h)es to implement optimization that traverse the inference program,
- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
There are some other basic concepts here:
- [Node](./node.h), the node in a `DataFlowGraph`,
- `Function`, the Operator in Fluid,
- `Value`, the Variable in Fluid,
- [Argument](./argument.h), the argument that is treated as the input and output of all `Pass`es in the pipeline.
## How it works
The `inference/analysis` module arranges all the passes in a pipeline and works in the following way:
1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
2. Call the middle passes one by one; the same `DataFlowGraph` is passed across all the passes,
3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
New optimization features can be added as independent `Pass`es and controlled by gflags;
each pass will generate unified debug information or a visualization for better debugging.
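A minimal sketch (for illustration, mirroring the unit test and the TensorRT sub-graph predictor in this change) of driving the whole pipeline from C++; `program_desc` stands for an already loaded inference `framework::proto::ProgramDesc`:
```c++
using namespace paddle::inference::analysis;  // NOLINT

Argument argument;
// The analysis consumes the original program ...
argument.origin_program_desc.reset(
    new paddle::framework::proto::ProgramDesc(program_desc));
Analyzer analyzer;
analyzer.Run(&argument);
// ... and produces the optimized one in argument.transformed_program_desc.
```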
## Supported Passes
### `FluidToDataFlowGraphPass`
Transform the Fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes;
this should be the first pass of the pipeline.
### `DataFlowGraphToFluidPass`
Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
### `TensorRTSubgraphNodeMarkPass`
Mark the `Node`s that are supported by TensorRT;
this pass will generate a visualization file which can be used for debugging.
### `TensorRTSubGraphPass`
Split out the sub-graphs that can be accelerated by TensorRT.
### `DFG_GraphvizDrawPass`
This pass is just for debugging; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
It can be used as a helper class that draws the modified graph after each pass.
## Utilities
There are some helper functions/classes for analysis.
- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code,
- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator`s to make the algorithms easy to share across different passes.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [](const Node* node) {
if (!node->IsFunction()) return false;
return static_cast<const Function*>(node)->func_type() == "mul";
};
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
}
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
}
std::string repr() const override { return "dfg-pass-manager"; }
std::string description() const override { return "DFG pass manager."; }
private:
void AddPass(const std::string& name, Pass* pass) {
LOG(INFO) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
}
// Add the graphviz debugger pass if the parent pass has one.
void AddGraphvizDebugerPass(Pass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) {
LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]";
Register(debuger_pass->repr(), debuger_pass);
}
}
};
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) {
for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument));
x->RunAll();
PADDLE_ENFORCE(x->Finalize());
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
/*
* This file contains Analyzer, a class that is exposed as a library and that
* analyzes and optimizes a Fluid ProgramDesc for inference. Similar to LLVM,
* it has multiple flags to control whether a process is applied on the
* program.
*
* The processes are called Passes in analysis; the Passes are placed in a
* pipeline. The first Pass is the FluidToDataFlowGraphPass which transforms a
* Fluid ProgramDesc to a data flow graph, and the last Pass is
* DataFlowGraphToFluidPass which transforms a data flow graph back to a Fluid
* ProgramDesc. The passes in the middle of the pipeline can be any Passes
* which take a node or data flow graph as input.
*
* The Analyzer can be used in two ways: the first is as an executable file
* which can be used to pre-process the inference model and can be controlled
* by passing different command flags; the other is to compose it inside the
* inference API as a runtime pre-processing phase in the inference service.
*/
#include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
namespace inference {
namespace analysis {
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
class Analyzer : public OrderedRegistry<PassManager> {
public:
// Register all the pass-managers.
Analyzer();
void Run(Argument* argument);
DISABLE_COPY_AND_ASSIGN(Analyzer);
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, main) {
Analyzer analyser;
analyser.Run(&argument);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -41,6 +41,9 @@ struct Argument { ...@@ -41,6 +41,9 @@ struct Argument {
// The original program desc. // The original program desc.
std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc; std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
// The processed program desc.
std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
}; };
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
......
...@@ -20,7 +20,7 @@ namespace paddle { ...@@ -20,7 +20,7 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
// It is a better idea that the inputs and outputs of this graph is set manully // It is a better idea that the inputs and outputs of this graph is set manually
// before, but there must be a Pass that helps to prune the unnecessary ops that // before, but there must be a Pass that helps to prune the unnecessary ops that
// do not contribute to the given targets, so in this pass, analysis and get the // do not contribute to the given targets, so in this pass, analysis and get the
// inputs and outputs is OK. // inputs and outputs is OK.
...@@ -50,6 +50,25 @@ void DataFlowGraph::Build() { ...@@ -50,6 +50,25 @@ void DataFlowGraph::Build() {
outputs.push_back(out); outputs.push_back(out);
} }
} }
Clean();
}
void DataFlowGraph::Clean() {
for (auto &node : nodes.nodes()) {
std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
node->inlinks.end());
std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
node->outlinks.end());
if (inlinks_set.size() < node->inlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
}
if (outlinks_set.size() < node->outlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
}
}
} }
std::string DataFlowGraph::DotString() const { std::string DataFlowGraph::DotString() const {
......
...@@ -47,6 +47,10 @@ struct DataFlowGraph { ...@@ -47,6 +47,10 @@ struct DataFlowGraph {
// Output a DOT graph file for debug. // Output a DOT graph file for debug.
std::string DotString() const; std::string DotString() const;
private:
// Remove duplicate edges and so on.
void Clean();
}; };
/* /*
...@@ -133,17 +137,24 @@ struct GraphTraits<DataFlowGraph> { ...@@ -133,17 +137,24 @@ struct GraphTraits<DataFlowGraph> {
// Extract the inputs and outputs of a graph. The inputs and outputs of a // Extract the inputs and outputs of a graph. The inputs and outputs of a
// sub-graph are the input nodes and output nodes that are not inside the // sub-graph are the input nodes and output nodes that are not inside the
// sub-graph. // sub-graph.
std::pair< static std::pair<std::vector<Node *>, std::vector<Node *>>
std::vector<Node *>, ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<
Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
&graph) {
std::unordered_set<Node *> nodes(graph.begin(), graph.end()); std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs; std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs; std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) { for (auto &node : graph) {
for (auto *in : node->inlinks) { for (auto *in : node->inlinks) {
if (!nodes.count(in) && in->type() == Node::Type::kValue) { // The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in); inputs.insert(in);
} }
} }
......
...@@ -13,21 +13,35 @@ ...@@ -13,21 +13,35 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/proto_desc.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using framework::proto::ProgramDesc;
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes);
bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
desc_ = argument->origin_program_desc.get(); PADDLE_ENFORCE(!argument->transformed_program_desc);
// Here some logic from program_desc.cc and will not add new interfaces into // The transformed_program_desc should inherit all the VarDesc and BlockDesc
// framework::ProgramDesc class, use some UT to assure the correctness. // from the original program desc. The operators of the main block(the first
auto* block = desc_->mutable_blocks()->Add(); // block) should be rewritten from the data flow graph.
block->set_idx(framework::kRootBlockIndex); argument->transformed_program_desc.reset(
block->set_parent_idx(framework::kNoneBlockIndex); new ProgramDesc(*argument->origin_program_desc));
argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
->clear_ops();
desc_ = argument->transformed_program_desc.get();
argument_ = argument;
return true; return true;
} }
...@@ -37,14 +51,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { ...@@ -37,14 +51,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
auto traits = GraphTraits<DataFlowGraph>(graph); auto traits = GraphTraits<DataFlowGraph>(graph);
for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
if (it->deleted()) continue; if (it->deleted()) continue;
switch (it->type()) { switch (it->type()) {
case Node::Type::kFunction: case Node::Type::kFunction: {
LOG(INFO) << "add function " << it->name(); LOG(INFO) << "add function " << it->repr();
AddFluidOp(&(*it)); AddFluidOp(&(*it));
break; } break;
case Node::Type::kFunctionBlock: case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << it->repr() << " , "
<< static_cast<FunctionBlock*>(&(*it))->subgraph.size();
AddEngineOp(&(*it)); AddEngineOp(&(*it));
break; } break;
default: default:
continue; continue;
} }
...@@ -52,12 +69,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { ...@@ -52,12 +69,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
} }
void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
LOG(INFO) << "processing func " << node->name();
auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc()); auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
// currently only the main block is analyzed. // currently only the main block is analyzed.
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops(); auto* op = main_block->add_ops();
LOG(INFO) << "to copy the op";
*op = *ori_op; // copy the attributes, by default, these will not be changed *op = *ori_op; // copy the attributes, by default, these will not be changed
// by the analysis phase. // by the analysis phase.
// The inputs and outputs of the existing ops are not changed by tensorrt // The inputs and outputs of the existing ops are not changed by tensorrt
...@@ -65,11 +80,90 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { ...@@ -65,11 +80,90 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
// NOTE It might be changed by other passes in the long run. // NOTE It might be changed by other passes in the long run.
} }
void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
const framework::proto::BlockDesc& block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto* func = static_cast<FunctionBlock*>(node);
// collect inputs
std::vector<std::string> io;
for (auto* x : func->inlinks) {
io.push_back(x->name());
}
desc.SetInput("Xs", io);
// collect outputs
io.clear();
for (auto* x : func->outlinks) {
io.push_back(x->name());
}
desc.SetOutput("Ys", io);
desc.SetType("tensorrt_engine");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "engine_unique_key",
"trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "max_workspace",
1024); // TODO(Superjomn) add config later
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes) {
std::vector<std::string> parameters;
for (const auto& node : nodes) {
if (!node->IsValue()) continue;
PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
framework::proto::VarDesc var;
var.ParseFromString(node->pb_msg());
if (var.persistable()) {
parameters.push_back(var.name());
}
}
return parameters;
}
void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
// auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
// auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
// auto* op = main_block->add_ops();
// TODO(Superjomn) Here need to expose some arguments for default setting. // TODO(Superjomn) Here need to expose some arguments for default setting.
PADDLE_ENFORCE(node->IsFunctionBlock());
auto* block_node = static_cast<FunctionBlock*>(node);
framework::proto::BlockDesc proto;
framework::BlockDesc block_desc(nullptr, &proto);
// copy ops.
for (auto* node : block_node->subgraph) {
auto* op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
op->ParseFromString(node->pb_msg());
}
namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
explicit DFG_DebuggerPass(const Config& config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
bool Finalize() override { return true; }
};
} // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
} }
} // namespace analysis } // namespace analysis
......
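As a rough illustration of what CreateTrtEngineOp above produces: the fused sub-graph is serialized into the op's "subgraph" string attribute as a BlockDesc proto. The hypothetical helper below (not part of the patch) shows how that attribute could be read back with plain protobuf accessors.

// Hypothetical sketch: recover the sub-graph block stored by CreateTrtEngineOp.
#include "paddle/fluid/framework/framework.pb.h"

bool RecoverSubgraphBlock(const paddle::framework::proto::OpDesc &op,
                          paddle::framework::proto::BlockDesc *block) {
  for (const auto &attr : op.attrs()) {
    if (attr.name() == "subgraph") {
      // CreateTrtEngineOp stored the block as a serialized string attribute.
      return block->ParseFromString(attr.s());
    }
  }
  return false;  // no "subgraph" attribute present
}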
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
...@@ -40,10 +41,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { ...@@ -40,10 +41,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
return "Transform a DFG to a Fluid ProgramDesc"; return "Transform a DFG to a Fluid ProgramDesc";
} }
Pass *CreatePrinterPass(std::ostream &os, Pass *CreateGraphvizDebugerPass() const override;
const std::string &banner) const override {
return nullptr;
}
protected: protected:
// Add a Fluid Op into the ProgramDesc. // Add a Fluid Op into the ProgramDesc.
...@@ -53,6 +51,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { ...@@ -53,6 +51,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
private: private:
framework::proto::ProgramDesc *desc_; framework::proto::ProgramDesc *desc_;
Argument *argument_;
}; };
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
......
...@@ -18,12 +18,19 @@ namespace paddle { ...@@ -18,12 +18,19 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
int DFG_GraphvizDrawPass::counter_{0};
void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto content = Draw(graph); auto content = Draw(graph);
std::ofstream file(GenDotPath()); auto dot_path = GenDotPath();
std::ofstream file(dot_path);
file.write(content.c_str(), content.size()); file.write(content.c_str(), content.size());
file.close(); file.close();
LOG(INFO) << "draw dot to " << GenDotPath();
auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
std::string message;
LOG(INFO) << "draw to " << png_path;
ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
} }
std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
...@@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { ...@@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
if (!config_.display_deleted_node && node.deleted()) continue; if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) { for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue; if (!config_.display_deleted_node && in->deleted()) continue;
for (auto &in : node.inlinks) { dot.AddEdge(in->repr(), node.repr(), {});
dot.AddEdge(in->repr(), node.repr(), {});
}
} }
} }
return dot.Build(); return dot.Build();
......
...@@ -46,24 +46,29 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { ...@@ -46,24 +46,29 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
const bool display_deleted_node; const bool display_deleted_node;
}; };
DFG_GraphvizDrawPass(const Config &config) : config_(config) {} explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
bool Initialize(Argument *argument) override { return true; } bool Initialize(Argument *argument) override { return true; }
void Run(DataFlowGraph *graph) override; void Run(DataFlowGraph *graph) override;
bool Finalize() override { return Pass::Finalize(); } bool Finalize() override { return true; }
std::string repr() const override { return "DFG graphviz drawer"; } std::string repr() const override { return "DFG graphviz drawer"; }
std::string description() const override { std::string description() const override {
return "Debug a DFG by draw with graphviz"; return "Debug a DFG by draw with graphviz";
} }
private: protected:
// A counter to add a numeric prefix to the debugger image output so that the
// files sort in the order in which they were generated.
static int counter_;
// Path of the dot file to output. // Path of the dot file to output.
std::string GenDotPath() const { std::string GenDotPath() const {
return config_.dir + "/" + "graph_" + config_.id + ".dot"; return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
config_.id + ".dot";
} }
std::string Draw(DataFlowGraph *graph); virtual std::string Draw(DataFlowGraph *graph);
Config config_; Config config_;
}; };
......
...@@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { ...@@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
pass.Run(&dfg); pass.Run(&dfg);
// test content // test content
std::ifstream file("./graph_test.dot"); std::ifstream file("./0-graph_test.dot");
ASSERT_TRUE(file.is_open()); ASSERT_TRUE(file.is_open());
std::string line; std::string line;
...@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { ...@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
no++; no++;
} }
// DFG is sensitive to ProgramDesc, be careful to change the existing models. // DFG is sensitive to ProgramDesc, be careful to change the existing models.
ASSERT_EQ(no, 112); ASSERT_EQ(no, 82);
} }
} // namespace analysis } // namespace analysis
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
namespace paddle { namespace paddle {
...@@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ...@@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
return true; return true;
} }
bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } bool FluidToDataFlowGraphPass::Finalize() { return true; }
void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE(graph); PADDLE_ENFORCE(graph);
...@@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { ...@@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
auto *v = graph->nodes.Create(Node::Type::kValue); auto *v = graph->nodes.Create(Node::Type::kValue);
v->SetName(var.name()); v->SetName(var.name());
v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var))); v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id(); var2id[var.name()] = v->id();
} }
for (int i = 0; i < main_block.ops_size(); i++) { for (int i = 0; i < main_block.ops_size(); i++) {
...@@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { ...@@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
// Link to the original protobuf message's memory, make it easier to // Link to the original protobuf message's memory, make it easier to
// generate from a data flow graph to fluid ProgramDesc. // generate from a data flow graph to fluid ProgramDesc.
o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op))); o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs // set inputs and outputs
// TODO(Superjomn) make sure the InputNames is the real variable name. // TODO(Superjomn) make sure the InputNames is the real variable name.
for (int j = 0; j < op.inputs_size(); j++) { for (int j = 0; j < op.inputs_size(); j++) {
...@@ -79,9 +84,20 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { ...@@ -79,9 +84,20 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
graph->Build(); graph->Build();
} }
Pass *FluidToDataFlowGraphPass::CreatePrinterPass( namespace {
std::ostream &os, const std::string &banner) const { class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
return nullptr; public:
using Config = DFG_GraphvizDrawPass::Config;
explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
bool Finalize() override { return true; }
};
}  // namespace
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
} }
} // namespace analysis } // namespace analysis
......
...@@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { ...@@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
return "transform a fluid ProgramDesc to a data flow graph."; return "transform a fluid ProgramDesc to a data flow graph.";
} }
Pass *CreatePrinterPass(std::ostream &os, Pass *CreateGraphvizDebugerPass() const override;
const std::string &banner) const override;
private: private:
framework::proto::ProgramDesc const *desc_; framework::proto::ProgramDesc const *desc_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/framework/framework.pb.h"
namespace paddle {
namespace inference {
namespace analysis {
template <>
void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
const std::string &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
const int &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
const std::string &name,
const std::vector<std::string> &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto &s : data) {
attr->add_strings(s.c_str());
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
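A brief, hypothetical sketch of how these SetAttr specializations might be used when assembling an op desc proto, mirroring the attribute names that CreateTrtEngineOp sets elsewhere in this patch; the wrapper function FillEngineAttrs is invented for illustration.

// Hypothetical sketch: fill a tensorrt_engine-style op desc with the helpers above.
#include <string>
#include <vector>

#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/inference/analysis/helper.h"

void FillEngineAttrs(paddle::framework::proto::OpDesc *op,
                     const std::string &serialized_block,
                     const std::vector<std::string> &params) {
  using paddle::inference::analysis::SetAttr;
  SetAttr<std::string>(op, "subgraph", serialized_block);       // STRING attr
  SetAttr<int>(op, "max_batch", 100);                           // INT attr
  SetAttr<std::vector<std::string>>(op, "parameters", params);  // STRINGS attr
}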
...@@ -14,10 +14,12 @@ limitations under the License. */ ...@@ -14,10 +14,12 @@ limitations under the License. */
#pragma once #pragma once
#include <cstdio>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -26,6 +28,10 @@ namespace paddle { ...@@ -26,6 +28,10 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
template <typename T>
void SetAttr(framework::proto::OpDesc *op, const std::string &name,
const T &data);
template <typename Vec> template <typename Vec>
int AccuDims(Vec &&vec, int size) { int AccuDims(Vec &&vec, int size) {
int res = 1; int res = 1;
...@@ -93,7 +99,7 @@ template <typename T> ...@@ -93,7 +99,7 @@ template <typename T>
class OrderedRegistry { class OrderedRegistry {
public: public:
T *Register(const std::string &name, T *x) { T *Register(const std::string &name, T *x) {
PADDLE_ENFORCE(!dic_.count(name)); PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
dic_[name] = data_.size(); dic_[name] = data_.size();
data_.emplace_back(std::unique_ptr<T>(x)); data_.emplace_back(std::unique_ptr<T>(x));
return data_.back().get(); return data_.back().get();
...@@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { ...@@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
return *var->GetMutable<T>(); return *var->GetMutable<T>();
} }
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
......
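A small, hypothetical usage sketch of the ExecShellCommand helper above; the choice of command, the assumption that Graphviz's dot binary is on PATH, and the wrapper function name are illustrative only.

// Hypothetical sketch: capture a command's output via ExecShellCommand.
#include <string>

#include <glog/logging.h>

#include "paddle/fluid/inference/analysis/helper.h"

void LogGraphvizVersion() {
  std::string message;
  // `dot -V` writes its version banner to stderr, hence the 2>&1 redirection.
  paddle::inference::analysis::ExecShellCommand("dot -V 2>&1", &message);
  LOG(INFO) << "graphviz: " << message;
}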
...@@ -20,6 +20,17 @@ namespace paddle { ...@@ -20,6 +20,17 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_hash_ = typeid(std::string).hash_code();
}
PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code());
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const { std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"), return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"), Dot::Attr("shape", "box"),
......
...@@ -35,6 +35,44 @@ namespace analysis { ...@@ -35,6 +35,44 @@ namespace analysis {
class NodeMap; class NodeMap;
// A helper class to maintain the status from Pass.
struct NodeAttr {
// NOTE T should be a primitive type or a struct composed of several primitive
// types.
// NOTE STL containers should not be used here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
std::string &String();
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
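To illustrate how a pass might consume these typed attributes, here is a brief hypothetical sketch. It uses only the accessors declared in this header plus the pb_msg and supported_by_tensorrt attributes that appear elsewhere in this patch; the function InspectNode itself is invented.

// Hypothetical sketch: read typed NodeAttr values from a Node.
#include <string>

#include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"

namespace paddle {
namespace inference {
namespace analysis {

void InspectNode(const Node *node) {
  // A bool flag set by TensorRTSubgraphNodeMarkPass.
  bool trt_ok = node->attr(ATTR_supported_by_tensorrt).Bool();
  // The serialized proto message stored by FluidToDataFlowGraphPass.
  const std::string &msg = node->pb_msg();
  VLOG(4) << node->repr() << " supported_by_tensorrt=" << trt_ok
          << " pb_msg bytes=" << msg.size();
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle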
/* /*
* Node Representation. * Node Representation.
* *
...@@ -50,8 +88,6 @@ class Node { ...@@ -50,8 +88,6 @@ class Node {
Node() = default; Node() = default;
struct Attr;
// Cast to a subclass type, Function for example. // Cast to a subclass type, Function for example.
template <typename Subclass> template <typename Subclass>
Subclass &As() { Subclass &As() {
...@@ -71,7 +107,7 @@ class Node { ...@@ -71,7 +107,7 @@ class Node {
// Get an additional attribute and convert it to T data type. NOTE this will // Get an additional attribute and convert it to T data type. NOTE this will
// silently create a new attribute if not exists. // silently create a new attribute if not exists.
Attr &attr(const std::string &name) const { return attrs_[name]; } NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
int id() const { return id_; } int id() const { return id_; }
...@@ -80,6 +116,9 @@ class Node { ...@@ -80,6 +116,9 @@ class Node {
void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
void *pb_desc() const { return attr("pb_desc").Pointer(); } void *pb_desc() const { return attr("pb_desc").Pointer(); }
void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
const std::string &pb_msg() const { return attr("pb_msg").String(); }
void SetDeleted() { deleted_ = true; } void SetDeleted() { deleted_ = true; }
bool deleted() const { return deleted_; } bool deleted() const { return deleted_; }
...@@ -94,43 +133,6 @@ class Node { ...@@ -94,43 +133,6 @@ class Node {
// Output links. // Output links.
std::vector<Node *> outlinks; std::vector<Node *> outlinks;
// A helper class to maintain the status from Pass.
struct Attr {
// NOTE T should be a primary type or a struct combined by several primary
// types.
// NOTE the STL containers should not use here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_hash_ = typeid(T).hash_code();
data_.resize(sizeof(T));
}
PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_hash_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
}
private:
std::string data_;
size_t type_hash_{std::numeric_limits<size_t>::max()};
};
// Type checks. // Type checks.
bool IsFunction() const { return type_ == Node::Type::kFunction; } bool IsFunction() const { return type_ == Node::Type::kFunction; }
bool IsValue() const { return type_ == Node::Type::kValue; } bool IsValue() const { return type_ == Node::Type::kValue; }
...@@ -150,7 +152,7 @@ class Node { ...@@ -150,7 +152,7 @@ class Node {
Type type_{Type::kNone}; Type type_{Type::kNone};
// Mark this node is deleted by some pass. // Mark this node is deleted by some pass.
bool deleted_{false}; bool deleted_{false};
mutable std::unordered_map<std::string, Attr> attrs_; mutable std::unordered_map<std::string, NodeAttr> attrs_;
}; };
class Function; class Function;
...@@ -213,6 +215,10 @@ class Function : public Node { ...@@ -213,6 +215,10 @@ class Function : public Node {
struct FunctionBlock : public Node { struct FunctionBlock : public Node {
std::string repr() const override { return "block-" + std::to_string(id()); } std::string repr() const override { return "block-" + std::to_string(id()); }
std::vector<Node *> subgraph; std::vector<Node *> subgraph;
protected:
FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
friend class NodeMap;
}; };
class NodeMap { class NodeMap {
...@@ -227,7 +233,7 @@ class NodeMap { ...@@ -227,7 +233,7 @@ class NodeMap {
void Delete(size_t id); void Delete(size_t id);
const std::vector<std::unique_ptr<Node>> &nodes() { return nodes_; } const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
size_t size() const { return nodes_.size(); } size_t size() const { return nodes_.size(); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
 * This file contains all the attribute flags declared for Node::Attr.
 *
 * Node::Attr is designed to share information between different passes; one
 * pass can read the attributes set by another pass on a Node via the flags
 * declared in this file.
 */
#pragma once
namespace paddle {
namespace inference {
namespace analysis {
#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
DECLARE_NODE_ATTR(supported_by_tensorrt) // bool
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -60,6 +60,9 @@ class Pass { ...@@ -60,6 +60,9 @@ class Pass {
return nullptr; return nullptr;
} }
// Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
// Run on a single Node. // Run on a single Node.
virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
// Run on a single Function. // Run on a single Function.
......
...@@ -19,6 +19,18 @@ namespace paddle { ...@@ -19,6 +19,18 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
bool PassManager::Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
void DfgPassManager::RunAll() { void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_);
for (auto& pass : data_) { for (auto& pass : data_) {
......
...@@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry<Pass> { ...@@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry<Pass> {
// globally shared, so pass them as the arguments for all the pass managers. // globally shared, so pass them as the arguments for all the pass managers.
virtual bool Initialize(const Argument& argument) { return false; } virtual bool Initialize(const Argument& argument) { return false; }
virtual bool Initialize(Argument* argument) { virtual bool Initialize(Argument* argument);
argument_ = argument;
for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr();
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
}
}
return true;
}
// Call all the passes' Finalize methods. // Call all the passes' Finalize methods.
virtual bool Finalize() { virtual bool Finalize() {
......
...@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h" #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) { ...@@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
ASSERT_TRUE(&argument);
ASSERT_TRUE(manager.Initialize(&argument)); ASSERT_TRUE(manager.Initialize(&argument));
manager.RunAll(); manager.RunAll();
} }
......
...@@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } ...@@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
void SubGraphFuse::ReplaceNodesWithSubGraphs() { void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) { for (auto &subgraph : subgraphs) {
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with a single block node. Three steps: 1. Create a Block // replace this sub-graph with a single block node. Three steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
// as deleted. 3. Replace the deleted node with the new Block Node. // as deleted. 3. Replace the deleted node with the new Block Node.
auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock); auto *block_node = static_cast<FunctionBlock *>(
graph_->nodes.Create(Node::Type::kFunctionBlock));
auto io = ExtractInputAndOutputOfSubGraph(subgraph); auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first); block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second); block_node->outlinks = std::move(io.second);
...@@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { ...@@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each // TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass. // pass.
node->SetDeleted(); node->SetDeleted();
block_node->subgraph.push_back(node);
} }
std::unordered_map<Node *, Node *> // Change all the sub-graph's inputs and outputs corresponding inlink and
delelte_node_map; // deleted node to BlockNode // outlink to this sub-graph node.
for (auto *n : block_node->inlinks) { auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
n->inlinks.clear(); for (auto *&n : nodes) {
} if (subgraph_uniq.count(n)) {
for (auto *n : block_node->outlinks) { n = block_node;
n->outlinks.clear(); }
} }
for (auto *n : block_node->inlinks) { std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
n->outlinks.push_back(block_node); nodes.assign(uniq.begin(), uniq.end());
};
for (auto *i : block_node->inlinks) {
inlink_or_outlink_cleaner(i->outlinks);
} }
for (auto *n : block_node->outlinks) { for (auto *&o : block_node->outlinks) {
n->inlinks.push_back(n); inlink_or_outlink_cleaner(o->inlinks);
} }
} }
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
for (auto &node : graph->nodes.nodes()) {
node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
}
}
class DfgDebuggerPass : public DFG_GraphvizDrawPass {
public:
explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override {
return "tensorrt-subgraph-node-mark-debugger";
}
bool Finalize() override { return true; }
protected:
std::string Draw(DataFlowGraph *graph) override {
Dot dot;
// Add nodes
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (config_.display_deleted_node || !node.deleted()) {
auto dot_attr = node.dot_attrs();
if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
dot_attr.assign(
{Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
}
dot.AddNode(node.repr(), dot_attr);
}
}
// Add edges
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {});
}
}
return dot.Build();
}
};
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config(
FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
return new DfgDebuggerPass(config);
}
bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
* that supported by TensorRT engine.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Mark the operators that TensorRT engine supports.
*/
class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
public:
using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller)
: teller_(teller) {}
bool Initialize(Argument* argument) override { return true; }
// This class gets a sub-graph as input and determines whether to transform
// this sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override;
std::string repr() const override { return "tensorrt-sub-subgraph-mark"; }
std::string description() const override {
return "tensorrt sub-graph mark pass";
}
Pass* CreateGraphvizDebugerPass() const override;
bool Finalize() override;
private:
teller_t teller_;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
// init
FluidToDataFlowGraphPass pass;
ASSERT_TRUE(pass.Initialize(&argument));
argument.main_dfg.reset(new DataFlowGraph);
pass.Run(argument.main_dfg.get());
TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
return node->IsFunction() &&
static_cast<const Function*>(node)->func_type() == "mul";
};
TensorRTSubgraphNodeMarkPass pass1(teller);
ASSERT_TRUE(pass1.Initialize(&argument));
pass1.Run(argument.main_dfg.get());
int counter{0};
for (auto& node : argument.main_dfg->nodes.nodes()) {
counter += node->attr(ATTR_supported_by_tensorrt).Bool();
}
LOG(INFO) << counter << " nodes marked";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( ...@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
: node_inside_subgraph_teller_(teller) {} : node_inside_subgraph_teller_(teller) {}
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_); SubGraphFuse(graph, node_inside_subgraph_teller_)();
} }
} // namespace analysis } // namespace analysis
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h"
...@@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// Tell whether to transform a sub-graph into TensorRT. // Tell whether to transform a sub-graph into TensorRT.
using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
bool Initialize(Argument* argument) override { return true; } bool Initialize(Argument* argument) override { return true; }
...@@ -38,10 +39,15 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { ...@@ -38,10 +39,15 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
// sub-graph into TensorRT. // sub-graph into TensorRT.
void Run(DataFlowGraph* graph) override; void Run(DataFlowGraph* graph) override;
bool Finalize() override { return true; }
std::string repr() const override { return "tensorrt-sub-graph"; }
std::string description() const override { return "tensorrt sub graph pass"; }
private: private:
NodeInsideSubgraphTeller node_inside_subgraph_teller_; NodeInsideSubgraphTeller node_inside_subgraph_teller_;
}; };
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // paddle } // namespace paddle
...@@ -23,49 +23,48 @@ namespace paddle { ...@@ -23,49 +23,48 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
DEFINE_string(model_dir, "", "inference test model dir"); DEFINE_string(dot_dir, "./", "");
TEST(TensorRTSubGraph, single_pass) { TEST_F(DFG_Tester, tensorrt_single_pass) {
auto desc = LoadProgramDesc(); std::unordered_set<std::string> teller_set(
auto dfg = ProgramDescToDFG(desc); {"elementwise_add", "mul", "sigmoid"});
SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
if (node->type() != Node::Type::kFunction) return false; if (node->type() != Node::Type::kFunction) return false;
const auto* func = static_cast<const Function*>(node); const auto* func = static_cast<const Function*>(node);
if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || if (teller_set.count(func->func_type())) return true;
func->func_type() == "conv2d" || func->func_type() == "mul" ||
func->func_type() == "sigmoid" || func->func_type() == "softmax") {
LOG(INFO) << "sub-graph marked " << node->repr();
return true;
}
return false; return false;
}; };
DFG_GraphvizDrawPass::Config config{"./", "test"}; LOG(INFO) << "init";
DFG_GraphvizDrawPass dfg_pass(config); DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
dfg_pass.Initialize(); DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
DFG_GraphvizDrawPass dfg_pass1(config);
dfg_pass1.Initialize();
dfg_pass.Run(&dfg);
DFG_GraphvizDrawPass dfg_pass(config);
DFG_GraphvizDrawPass dfg_pass1(config1);
FluidToDataFlowGraphPass pass0;
TensorRTSubGraphPass trt_pass(std::move(teller)); TensorRTSubGraphPass trt_pass(std::move(teller));
trt_pass.Initialize();
trt_pass.Run(&dfg); LOG(INFO) << "Initialize";
dfg_pass.Initialize(&argument);
dfg_pass1.Initialize(&argument);
pass0.Initialize(&argument);
trt_pass.Initialize(&argument);
dfg_pass1.Run(&dfg); LOG(INFO) << "Run";
argument.main_dfg.reset(new DataFlowGraph);
pass0.Run(argument.main_dfg.get());
dfg_pass.Run(argument.main_dfg.get());
trt_pass.Run(argument.main_dfg.get());
dfg_pass1.Run(argument.main_dfg.get());
// Check the TRT op's block desc // Check the TRT op's block desc
for (auto node : dfg.nodes.nodes()) { for (auto& node : argument.main_dfg->nodes.nodes()) {
if (node->IsFunctionBlock()) { if (node->IsFunctionBlock()) {
LOG(INFO) << "get function block";
} }
} }
} }
TEST(TensorRTSubGraph, pass_manager) {}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_service.h>
#include <omp.h> #include <omp.h>
#endif #endif
...@@ -164,7 +164,7 @@ TEST(inference, nlp) { ...@@ -164,7 +164,7 @@ TEST(inference, nlp) {
// only use 1 thread number per std::thread // only use 1 thread number per std::thread
omp_set_dynamic(0); omp_set_dynamic(0);
omp_set_num_threads(1); omp_set_num_threads(1);
mkl_set_num_threads(1); paddle::operators::math::SetNumThreads(1);
#endif #endif
double start_ms = 0, stop_ms = 0; double start_ms = 0, stop_ms = 0;
......
...@@ -20,6 +20,12 @@ limitations under the License. */ ...@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
DEFINE_bool(init_allocated_mem, false,
            "Some op implementations mistakenly assume that memory allocated "
            "by BuddyAllocator is always zeroed. To catch such errors early, "
            "set init_allocated_mem so that allocated memory is filled with a "
            "small non-zero value during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle { namespace paddle {
...@@ -41,6 +47,9 @@ template <> ...@@ -41,6 +47,9 @@ template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size); void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p; VLOG(10) << " pointer=" << p;
return p; return p;
} }
...@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place); LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev); platform::SetDeviceId(cur_dev);
} }
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr; return ptr;
} }
...@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, ...@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace"; << " bytes in CUDAPinnedPlace";
} }
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr; return ptr;
} }
......
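The flag added above fills fresh allocations with the byte 0xEF so that ops which silently assume zero-initialized memory fail visibly in unit tests. Below is a standalone, hypothetical sketch of the same poisoning idea, independent of Paddle's allocators.

// Hypothetical standalone sketch of 0xEF poisoning: a consumer that assumes
// freshly allocated memory is zeroed will now observe garbage and fail fast.
#include <cassert>
#include <cstdlib>
#include <cstring>

int main() {
  const std::size_t n = 64;
  unsigned char *p = static_cast<unsigned char *>(std::malloc(n));
  std::memset(p, 0xEF, n);  // mimic FLAGS_init_allocated_mem=true
  // The (wrong) assumption "p[0] == 0" is now guaranteed to be violated:
  assert(p[0] == 0xEF);
  std::free(p);
  return 0;
}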
...@@ -195,7 +195,7 @@ if(WITH_DISTRIBUTE) ...@@ -195,7 +195,7 @@ if(WITH_DISTRIBUTE)
endif() endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
foreach(dist_op "prefetch_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach() endforeach()
...@@ -216,7 +216,7 @@ if(WITH_DISTRIBUTE) ...@@ -216,7 +216,7 @@ if(WITH_DISTRIBUTE)
set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
endif() endif()
else() else()
set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
endif() endif()
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
...@@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax) ...@@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND) if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine) op_library(tensorrt_engine_op DEPS tensorrt_engine)
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter) DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
analysis)
else() else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif() endif()
......
...@@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel { ...@@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel {
"Beta2 power accumulator should have 1 dimension"); "Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param"); auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( if (ctx->GetInputsVarType("Grad")[0] ==
param_dims, ctx->GetInputDim("Grad"), framework::proto::VarType::LOD_TENSOR) {
"Param and Grad input of AdamOp should have same dimension"); PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
}
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"), param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment1 input of AdamOp should have same dimension"); "Param and Moment1 input of AdamOp should have same dimension");
......
...@@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad = auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad"); Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
// merge duplicated rows if any. // merge duplicated rows if any.
scatter::MergeAdd<DeviceContext, T> merge_func; scatter::MergeAdd<DeviceContext, T> merge_func;
auto grad_merge = auto grad_merge =
......
...@@ -26,14 +26,15 @@ namespace operators { ...@@ -26,14 +26,15 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS; using platform::PADDLE_CUDA_NUM_THREADS;
const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid
__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size, __global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size,
int axis, int64_t n, int64_t* trg_idx, int axis, int64_t n, int64_t* trg_idx,
int64_t* med_ids) { int64_t* med_ids) {
int64_t index = threadIdx.x + blockDim.x * blockIdx.x; int64_t index = threadIdx.x + blockDim.x * blockIdx.x;
if (index < n) { if (index < n) {
const int max_rank = 9; // Max rank of a tensor allow in Fluid int64_t shape_out_axis[kMaxRank - 1] = {0};
int64_t shape_out_axis[max_rank - 1] = {0}; int64_t dims_out_axis[kMaxRank - 1] = {0};
int64_t dims_out_axis[max_rank - 1] = {0};
int64_t tmp = index; int64_t tmp = index;
int64_t pos_in_axis = 0; int64_t pos_in_axis = 0;
int64_t i = dims_size - 2; int64_t i = dims_size - 2;
...@@ -125,10 +126,8 @@ class ArgsortOpCUDAKernel : public framework::OpKernel<T> { ...@@ -125,10 +126,8 @@ class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
Tensor trg_idx_t; Tensor trg_idx_t;
int64_t* trg_idx = trg_idx_t.mutable_data<int64_t>(in_dims, ctx.GetPlace()); int64_t* trg_idx = trg_idx_t.mutable_data<int64_t>(in_dims, ctx.GetPlace());
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>( auto stream = ctx.cuda_device_context().stream();
ctx.device_context()) const int num_threads = PADDLE_CUDA_NUM_THREADS;
.stream();
int num_threads = PADDLE_CUDA_NUM_THREADS;
ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data); in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data);
......
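ComputeTargetIdx maps each flat element index to the 1-D slice being sorted and to its position along the sort axis; the new file-scope kMaxRank only bounds the per-thread scratch arrays. The index arithmetic itself is ordinary row-major decomposition, sketched here in Python with illustrative names:

def split_index(flat_idx, shape, axis):
    """Return (slice_id, pos_in_axis) for a flat row-major index."""
    coords = []
    for dim in reversed(shape):       # innermost dimension first
        coords.append(flat_idx % dim)
        flat_idx //= dim
    coords.reverse()
    pos_in_axis = coords[axis]
    slice_id = 0                      # linearize all dims except `axis`
    for i, (c, dim) in enumerate(zip(coords, shape)):
        if i != axis:
            slice_id = slice_id * dim + c
    return slice_id, pos_in_axis

# e.g. shape (2, 3, 4), axis 1: flat index 7 -> coords (0, 1, 3),
# so it is element 1 of slice 3 (= 0 * 4 + 3).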
...@@ -70,6 +70,7 @@ $$Out = values$$ ...@@ -70,6 +70,7 @@ $$Out = values$$
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>, REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>); ops::AssignValueKernel<float>);
...@@ -19,28 +19,28 @@ namespace operators { ...@@ -19,28 +19,28 @@ namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>( void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t* num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates,
int64_t* num_accumulates_, int64_t* old_num_accumulates_) { int64_t* num_accumulates, int64_t* old_num_accumulates) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
*old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
*num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; *num_accumulates = in_num_accumulates->data<int64_t>()[0];
*num_updates_ = in_num_updates->data<int64_t>()[0]; *num_updates = in_num_updates->data<int64_t>()[0];
} }
template <> template <>
void SetAccumulators<paddle::platform::CPUDeviceContext>( void SetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t num_updates_, const framework::ExecutionContext& ctx, int64_t num_updates,
int64_t num_accumulates_, int64_t old_num_accumulates_) { int64_t num_accumulates, int64_t old_num_accumulates) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates"); auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates"); auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates"); auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_; out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
out_num_accumulates->data<int64_t>()[0] = num_accumulates_; out_num_accumulates->data<int64_t>()[0] = num_accumulates;
out_num_updates->data<int64_t>()[0] = num_updates_; out_num_updates->data<int64_t>()[0] = num_updates;
} }
class AverageAccumulatesOp : public framework::OperatorWithKernel { class AverageAccumulatesOp : public framework::OperatorWithKernel {
...@@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
AverageAccumulates Operator. AverageAccumulates Operator.
Accumulate the sum of parameter whtin sliding window. The size of sliding window is Accumulate the sum of parameter within sliding window. The size of sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'. determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
......
...@@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
float average_window = ctx.Attr<float>("average_window"); float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window"); int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window"); int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
min_average_window = PADDLE_ENFORCE_LE(min_average_window, max_average_window,
std::min<int64_t>(min_average_window, max_average_window); "min_average_window shouldn't be larger than "
"max_average_window");
// Get inputs // Get inputs
auto* param = ctx.Input<Tensor>("param"); auto* param = ctx.Input<Tensor>("param");
......
...@@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum"); const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean"); const auto *mean = ctx.Input<Tensor>("Mean");
...@@ -111,11 +112,15 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -111,11 +112,15 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
unsigned flags = mkldnn::use_scale_shift; unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats; if (is_test) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor // create mkldnn memory from input x tensor
auto src_memory = mkldnn::memory::format input_format =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), x->format());
to_void_cast(x_data));
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// create primitive descriptor for batch norm forward // create primitive descriptor for batch norm forward
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
...@@ -249,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -249,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>; using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
// create mkldnn memory from input diff_y tensor // create mkldnn memory from input diff_y tensor
auto user_diff_dst_memory =
memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, mkldnn::memory::format dst_format =
mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
to_void_cast(diff_y_data));
auto user_diff_dst_memory = memory(
{{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
to_void_cast(diff_y_data));
// create mkldnn memory from input x tensor // create mkldnn memory from input x tensor
auto src_memory = mkldnn::memory::format input_format =
memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, platform::MKLDNNFormatForSize(src_tz.size(), x->format());
to_void_cast(x_data));
auto src_memory = memory(
{{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
to_void_cast(x_data));
// for diff_dst, try to use same format as dst in forward pass // for diff_dst, try to use same format as dst in forward pass
auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
......
...@@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Batch Normalization. Batch Normalization.
......
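With the new fuse_with_relu attribute the MKL-DNN kernel asks the primitive (via mkldnn::fuse_bn_relu) to clamp the normalized output at zero in the same pass. Numerically that is just batch norm followed by ReLU, as in this small NumPy sketch (inference form, illustrative only):

import numpy as np

def batch_norm_relu(x, scale, shift, mean, var, epsilon=1e-5, fuse_with_relu=True):
    # Normalize with the running statistics, then optionally apply ReLU.
    y = scale * (x - mean) / np.sqrt(var + epsilon) + shift
    return np.maximum(y, 0.0) if fuse_with_relu else y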
...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/beam_search_decode_op.h" #include <algorithm>
#include <string> #include <string>
#include "paddle/fluid/operators/beam_search_decode_op.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
...@@ -22,8 +24,11 @@ namespace operators { ...@@ -22,8 +24,11 @@ namespace operators {
struct BeamSearchDecodeFunctor { struct BeamSearchDecodeFunctor {
BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, const LoDTensorArray& step_scores,
LoDTensor* id_tensor, LoDTensor* score_tensor) LoDTensor* id_tensor, LoDTensor* score_tensor,
: step_ids_origin_(step_ids), size_t beam_size, int end_id)
: beam_size_(beam_size),
end_id_(end_id),
step_ids_origin_(step_ids),
step_scores_origin_(step_scores), step_scores_origin_(step_scores),
id_tensor_(id_tensor), id_tensor_(id_tensor),
score_tensor_(score_tensor) { score_tensor_(score_tensor) {
...@@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor { ...@@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor {
// Copy all tensors in the input tensor array // Copy all tensors in the input tensor array
for (auto& step_id : step_ids_origin_) { for (auto& step_id : step_ids_origin_) {
framework::LoDTensor out; framework::LoDTensor out;
dev_ctx->Wait(); if (step_id.numel() > 0) {
framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); dev_ctx->Wait();
dev_ctx->Wait(); framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
dev_ctx->Wait();
}
out.set_lod(step_id.lod()); out.set_lod(step_id.lod());
step_ids_.push_back(out); step_ids_.push_back(out);
...@@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor { ...@@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor {
// Copy all tensors in the input tensor array // Copy all tensors in the input tensor array
for (auto& step_score : step_scores_origin_) { for (auto& step_score : step_scores_origin_) {
framework::LoDTensor out; framework::LoDTensor out;
dev_ctx->Wait(); if (step_score.numel() > 0) {
framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); dev_ctx->Wait();
dev_ctx->Wait(); framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx,
&out);
dev_ctx->Wait();
}
out.set_lod(step_score.lod()); out.set_lod(step_score.lod());
step_scores_.push_back(out); step_scores_.push_back(out);
...@@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor { ...@@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor {
void operator()() const; void operator()() const;
bool tensor_on_gpu_; bool tensor_on_gpu_;
size_t beam_size_;
int end_id_;
const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_ids_origin_;
const LoDTensorArray& step_scores_origin_; const LoDTensorArray& step_scores_origin_;
LoDTensorArray step_ids_ = LoDTensorArray(); LoDTensorArray step_ids_ = LoDTensorArray();
...@@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor { ...@@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor {
template <typename T> template <typename T>
void BeamSearchDecodeFunctor::operator()() const { void BeamSearchDecodeFunctor::operator()() const {
BeamSearchDecoder<T> beam_search_decoder; BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
// Check if the tensor is on GPU. If so, use the CPU copy instead // Check if the tensor is on GPU. If so, use the CPU copy instead
if (tensor_on_gpu_) { if (tensor_on_gpu_) {
beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_,
score_tensor_); score_tensor_);
} else { } else {
beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_,
id_tensor_, score_tensor_); id_tensor_, score_tensor_);
} }
} }
...@@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { ...@@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
"Level of LodTensor should be 2"); "Level of LodTensor should be 2");
} }
size_t beam_size = ctx.Attr<int>("beam_size");
int end_id = ctx.Attr<int>("end_id");
// prepare output // prepare output
LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds"); LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores"); LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
framework::VisitDataType( framework::VisitDataType(
framework::ToDataType(scores->at(0).type()), framework::ToDataType(scores->at(0).type()),
BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
beam_size, end_id));
} }
}; };
...@@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("Ids", AddInput("Ids",
"(LodTensorArray)" "(LodTensorArray)"
"score of the candidate words in each step"); "The LodTensorArray containing the selected ids of all steps");
AddInput("Scores", AddInput("Scores",
"(LodTensorArray)" "(LodTensorArray)"
"score of the candidate words in each step"); "The LodTensorArray containing the selected scores of all steps");
AddOutput("SentenceIds", AddOutput(
"(LodTensor)" "SentenceIds",
"All possible result sentences of word ids"); "(LodTensor)"
AddOutput("SentenceScores", "An LodTensor containing all generated id sequences for all source "
"(LodTensor)" "sentences");
"All possible result sentences of word scores"); AddOutput(
"SentenceScores",
"(LodTensor)"
"An LodTensor containing scores corresponding to Output(SentenceIds)");
AddAttr<int>("beam_size", "beam size for beam search");
AddAttr<int>("end_id",
"the token id which indicates the end of a sequence");
AddComment(R"DOC( AddComment(R"DOC(
Pack the result of Beam search op into SentenceIds and SentenceScores. Beam Search Decode Operator. This Operator constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray Input(ids)
whose lods can be used to restore the path in the beam search tree.
The Output(SentenceIds) and Output(SentenceScores) separately contain the
generated id sequences and the corresponding scores. The shapes and lods of the
two LodTensors are the same. The lod level is 2 and the two levels separately
indicate how many hypotheses each source sentence has and how many ids each
hypothesis has.
)DOC"); )DOC");
} }
}; };
...@@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { ...@@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
void operator()(const framework::OpDesc& op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override { framework::BlockDesc* block) const override {
for (auto& o : op_desc.Output("SentenceIds")) { for (auto& o : op_desc.Output("SentenceIds")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
} }
for (auto& o : op_desc.Output("SentenceScores")) { for (auto& o : op_desc.Output("SentenceScores")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
} }
} }
}; };
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor; ...@@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor;
using LoDTensorArray = framework::LoDTensorArray; using LoDTensorArray = framework::LoDTensorArray;
// all the lod have 2 levels. // all the lod have 2 levels.
// The First is source level, the second is sentence level. // The first is source level, the second is sentence level.
// source level describe how many candidate words for this source. // source level describes how many prefixes (branches) for each source sentence
// sentence level describe these candidates belong to which prefix // (beam). sentence level describes how these candidates belong to the prefixes.
const size_t kSourceLevel = 0; const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1; const size_t kSentenceLevel = 1;
template <typename T>
struct BeamNode {
BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
~BeamNode() {
if (parent_) {
parent_->DropKid(this);
if (parent_->kids_.size() == 0UL) {
delete parent_;
}
}
VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
}
void AppendTo(BeamNode* parent) {
parent_ = parent;
parent->kids_.insert(this);
}
void DropKid(BeamNode* kid) { kids_.erase(kid); }
BeamNode* parent_ = nullptr;
std::unordered_set<BeamNode*> kids_;
int64_t word_id_;
T score_;
};
template <typename T>
using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
template <typename T> template <typename T>
struct Sentence { struct Sentence {
std::vector<int64_t> word_ids; std::vector<int64_t> word_ids;
...@@ -72,24 +44,8 @@ using SentenceVector = std::vector<Sentence<T>>; ...@@ -72,24 +44,8 @@ using SentenceVector = std::vector<Sentence<T>>;
template <typename T> template <typename T>
struct BeamSearchDecoder { struct BeamSearchDecoder {
/** BeamSearchDecoder(size_t beam_size, int end_id)
* make a BeamNode and all it's related prefix BeanNode into a Sentence. : beam_size_(beam_size), end_id_(end_id) {}
*/
Sentence<T> MakeSentence(const BeamNode<T>* node) const;
/**
* Param:
* cur_ids: LoDTensor of One step for word ID
* cur_scores: LoDTensor of One Step for word score
* prefixes_list: prefixes for each source sentence.
* sentence_vector_list: result sentence_vector for each source sentence.
* Return:
* a new prefixes list for each source of current step
*/
std::vector<BeamNodeVector<T>> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const;
/** /**
* convert the result sentence_vector for each source sentence into two * convert the result sentence_vector for each source sentence into two
...@@ -100,107 +56,30 @@ struct BeamSearchDecoder { ...@@ -100,107 +56,30 @@ struct BeamSearchDecoder {
* sentence_vector_list: sentence_vector for each source sentence. * sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor for sentences of id. * id_tensor: result LoDTensor for sentences of id.
* score_tensor: result LoDTensor for sentences of score. * score_tensor: result LoDTensor for sentences of score.
* reverse: whether ids of sentence in sentence_vector_list is reversed
* sort_by_score: whether to sort hypotheses of each sentence by scores.
*/ */
void ConvertSentenceVectorToLodTensor( void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor, std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const; LoDTensor* score_tensor, bool reverse = true,
bool sort_by_score = true) const;
/** /**
* Pack all steps of id/score LodTensor into sentence LoDTensor * Gather the hypotheses for each source sentence by backtrace though the
* it's main logic is: * LoDTensorArray step_ids whose lods reserve the path in the tree.
* ```python
* prefix
* result_sentence
* result_lod_tensor
*
* for (step in steps):
* prefix = PackTwoSteps(prefix, step, &result_sentence)
* ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
* ```
*/ */
void PackAllSteps(const LoDTensorArray& step_ids, void Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, LoDTensor* id_tensor, const LoDTensorArray& step_scores, LoDTensor* id_tensor,
LoDTensor* score_tensor) const; LoDTensor* score_tensor) const;
};
template <typename T>
Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
Sentence<T> sentence;
while (node != nullptr) {
sentence.word_ids.emplace_back(node->word_id_);
sentence.scores.emplace_back(node->score_);
node = node->parent_;
}
std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
return sentence;
}
template <typename T>
std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>* prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const {
std::vector<BeamNodeVector<T>> result;
for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; size_t beam_size_;
++src_idx) { int end_id_;
size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; };
size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
BeamNodeVector<T> beam_nodes;
// if prefixes size is 0, it means this is the first step. In this step,
// all candidate id is the start of candidate sentences.
if (prefixes_list->empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
}
} else {
BeamNodeVector<T>& prefixes = prefixes_list->at(src_idx);
SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
"prefix and candidate set number should be the same");
auto candidate_offset = cur_ids.lod()[kSentenceLevel];
for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
size_t candidate_start = candidate_offset[src_start + prefix_idx];
size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
if (candidate_start == candidate_end) {
VLOG(3) << "this sentence has no more candidate, "
"add to result sentence and rm it from beam tree";
sentence_vector.push_back(MakeSentence(prefix.get()));
prefix.reset();
} else {
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
auto* candidate =
new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
cur_scores.data<T>()[candidate_idx]);
candidate->AppendTo(prefix.get());
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
}
prefix.release();
}
}
}
result.push_back(std::move(beam_nodes));
}
return result;
}
template <typename T> template <typename T>
void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor( void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor, std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const { LoDTensor* score_tensor, bool reverse, bool sort_by_score) const {
size_t src_num = sentence_vector_list.size(); size_t src_num = sentence_vector_list.size();
PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
...@@ -211,11 +90,29 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor( ...@@ -211,11 +90,29 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<T> score_data; std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
if (sort_by_score) {
sort(sentence_vector_list[src_idx].begin(),
sentence_vector_list[src_idx].end(),
[reverse](const Sentence<T>& a, const Sentence<T>& b) {
if (reverse)
return a.scores.front() > b.scores.front();
else
return a.scores.back() > b.scores.back();
});
}
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) { for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
id_data.insert(id_data.end(), sentence.word_ids.begin(), if (reverse) {
sentence.word_ids.end()); id_data.insert(id_data.end(), sentence.word_ids.rbegin(),
score_data.insert(score_data.end(), sentence.scores.begin(), sentence.word_ids.rend());
sentence.scores.end()); score_data.insert(score_data.end(), sentence.scores.rbegin(),
sentence.scores.rend());
} else {
id_data.insert(id_data.end(), sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(score_data.end(), sentence.scores.begin(),
sentence.scores.end());
}
sentence_level_lod.push_back(sentence_level_lod.back() + sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size()); sentence.word_ids.size());
} }
...@@ -243,39 +140,75 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor( ...@@ -243,39 +140,75 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
} }
template <typename T> template <typename T>
void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids, void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, const LoDTensorArray& step_scores,
LoDTensor* id_tensor, LoDTensor* id_tensor,
LoDTensor* score_tensor) const { LoDTensor* score_tensor) const {
PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
"step_ids and step_scores should be the same"); "step_ids and step_scores should be the same");
const size_t step_num = step_ids.size(); const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
std::vector<SentenceVector<T>> sentence_vector_list(
src_num, SentenceVector<T>(beam_size_));
std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
for (int step_id = step_num - 1; step_id >= 0; --step_id) {
auto& cur_ids = step_ids.at(step_id);
auto& cur_scores = step_scores.at(step_id);
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
// for each source sentence
auto& sentence_vector = sentence_vector_list.at(src_idx);
auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
if (prefix_idx_vector.empty()) { // be finished and pruned at this step
// or the last time step
for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
++prefix_idx) {
size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
size_t candidate_end =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
prefix_idx_vector.push_back(prefix_idx);
size_t idx = prefix_idx_vector.size() - 1;
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
}
} else { // use prefix_idx_vector to backtrace
size_t src_candidate_start =
cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
size_t prefix_idx = src_prefix_start;
size_t candidate_num =
cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
cur_ids.lod().at(kSentenceLevel)[prefix_idx];
for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
auto candidate_idx = prefix_idx_vector.at(idx);
auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
auto cur_score = cur_scores.data<T>()[candidate_idx];
if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
// to skip redundant end tokens
sentence_vector.at(idx).word_ids.push_back(cur_id);
sentence_vector.at(idx).scores.push_back(cur_score);
}
PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); while (src_candidate_start + candidate_num <=
candidate_idx) { // search the corresponding prefix
// previous prefixes for each step, prefix_idx++;
// the init length is 0, means this is the first step. candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
std::vector<BeamNodeVector<T>> beamnode_vector_list(0); cur_ids.lod().at(kSentenceLevel)[prefix_idx];
std::vector<SentenceVector<T>> sentence_vector_list(src_num); }
prefix_idx_vector.at(idx) = prefix_idx;
// pack all steps for one batch first, then another batch }
for (size_t step_id = 0; step_id < step_num; ++step_id) { }
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
&beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
beam_node.reset();
} }
} }
ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
score_tensor); score_tensor, true, true);
} }
} // namespace operators } // namespace operators
......
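The new Backtrace walks the step LoDTensorArray from the last step to the first, recovering each hypothesis' prefix from the sentence-level lod and skipping repeated end tokens, instead of growing a BeamNode tree step by step. A simplified Python sketch of the same traversal, with parent indices given explicitly rather than decoded from lods (all names here are illustrative):

def backtrace(step_ids, step_parents, step_scores, end_id):
    """step_ids[t][k]: id kept for branch k at step t;
    step_parents[t][k]: index of its prefix at step t-1;
    step_scores[t][k]: accumulated score of that branch."""
    last = len(step_ids) - 1
    hyps = []
    for k in range(len(step_ids[last])):
        ids, scores, idx = [], [], k
        for t in range(last, -1, -1):
            cur_id = step_ids[t][idx]
            # Skip redundant end tokens, but always keep the very last token.
            if cur_id != end_id or not ids:
                ids.append(cur_id)
                scores.append(step_scores[t][idx])
            idx = step_parents[t][idx]
        ids.reverse()
        scores.reverse()
        hyps.append((ids, scores))
    # Best hypothesis first, by final accumulated score (cf. sort_by_score).
    hyps.sort(key=lambda h: h[1][-1], reverse=True)
    return hyps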
...@@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; ...@@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD;
using LoDTensor = paddle::framework::LoDTensor; using LoDTensor = paddle::framework::LoDTensor;
using LoDTensorArray = paddle::framework::LoDTensorArray; using LoDTensorArray = paddle::framework::LoDTensorArray;
template <typename T>
using BeamNode = paddle::operators::BeamNode<T>;
template <typename T> template <typename T>
using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>; using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
template <typename T> template <typename T>
using Sentence = paddle::operators::Sentence<T>; using Sentence = paddle::operators::Sentence<T>;
template <typename T> template <typename T>
using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
template <typename T>
using SentenceVector = paddle::operators::SentenceVector<T>; using SentenceVector = paddle::operators::SentenceVector<T>;
namespace paddle { namespace paddle {
...@@ -77,138 +73,50 @@ void GenerateExample(const std::vector<size_t>& level_0, ...@@ -77,138 +73,50 @@ void GenerateExample(const std::vector<size_t>& level_0,
} // namespace test } // namespace test
} // namespace paddle } // namespace paddle
TEST(BeamSearchDecodeOp, DeleteBeamNode) { TEST(BeamSearchDecodeOp, Backtrace) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* b2 = new BeamNode<float>(2, 2);
auto* b3 = new BeamNode<float>(3, 3);
b1->AppendTo(root);
b2->AppendTo(root);
b3->AppendTo(b1);
delete b3;
delete b2;
}
TEST(BeamSearchDecodeOp, MakeSentence) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* end = new BeamNode<float>(2, 2);
b1->AppendTo(root);
end->AppendTo(b1);
BeamSearchDecoder<float> helper;
Sentence<float> sentence = helper.MakeSentence(end);
delete end;
std::vector<int64_t> expect_ids = {0, 1, 2};
ASSERT_EQ(sentence.word_ids, expect_ids);
std::vector<float> expect_scores = {0, 1, 2};
ASSERT_EQ(sentence.scores, expect_scores);
}
TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
CPUPlace place;
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(
std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
BeamSearchDecoder<float> helper;
beamnode_vector_list = helper.PackTwoSteps(
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
}
TEST(BeamSearchDecodeOp, PackTwoSteps) {
CPUPlace place;
// first source has three prefix
BeamNodeVector<float> source0_prefixes;
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
// second source has two prefix
BeamNodeVector<float> source1_prefixes;
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
beamnode_vector_list.push_back(std::move(source0_prefixes));
beamnode_vector_list.push_back(std::move(source1_prefixes));
// generate data for one step
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
std::vector<size_t>{0, 1, 1, 3, 4, 5},
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
BeamSearchDecoder<float> helper1;
beamnode_vector_list = helper1.PackTwoSteps(
ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
}
TEST(BeamSearchDecodeOp, PackAllSteps) {
CPUPlace place; CPUPlace place;
// we will constuct a sample data with 3 steps and 2 source sentences // Construct sample data with 5 steps and 2 source sentences
// beam_size = 2, start_id = 0, end_id = 1
LoDTensorArray ids; LoDTensorArray ids;
LoDTensorArray scores; LoDTensorArray scores;
paddle::test::GenerateExample( paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6}, std::vector<size_t>{0, 1, 2}, std::vector<size_t>{0, 1, 2},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores); std::vector<int>{0, 0}, &ids, &scores); // start with start_id
paddle::test::GenerateExample(std::vector<size_t>{0, 1, 2},
std::vector<size_t>{0, 2, 4},
std::vector<int>{2, 3, 4, 5}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 2, 2, 4, 4},
std::vector<int>{3, 1, 5, 4}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
std::vector<size_t>{0, 1, 2, 3, 4},
std::vector<int>{1, 1, 3, 5}, &ids, &scores);
paddle::test::GenerateExample( paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6}, std::vector<size_t>{0, 2, 4},
std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores); std::vector<size_t>{0, 0, 0, 2,
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6}, 2}, // the branches of the first source sentence
std::vector<size_t>{0, 0, 1, 2, 3, 4, 5}, // are pruned since finished
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores); std::vector<int>{5, 1},
&ids, &scores);
ASSERT_EQ(ids.size(), 3UL); ASSERT_EQ(ids.size(), 5UL);
ASSERT_EQ(scores.size(), 3UL); ASSERT_EQ(scores.size(), 5UL);
BeamSearchDecoder<float> helper; BeamSearchDecoder<float> helper(2, 1); // beam_size = 2, end_id = 1
LoDTensor id_tensor; LoDTensor id_tensor;
LoDTensor score_tensor; LoDTensor score_tensor;
helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); helper.Backtrace(ids, scores, &id_tensor, &score_tensor);
LoD lod = id_tensor.lod(); LoD lod = id_tensor.lod();
std::vector<size_t> expect_source_lod = {0, 4, 8}; std::vector<size_t> expect_source_lod = {0, 2, 4};
EXPECT_EQ(lod[0], expect_source_lod); EXPECT_EQ(lod[0], expect_source_lod);
std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; std::vector<size_t> expect_sentence_lod = {0, 4, 7, 12, 17};
EXPECT_EQ(lod[1], expect_sentence_lod); EXPECT_EQ(lod[1], expect_sentence_lod);
// 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 std::vector<int> expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4,
std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 5, 3, 5, 0, 4, 5, 3, 1};
4, 3, 2, 4, 4, 3, 6, 5, 4};
ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size())); ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
for (size_t i = 0; i < expect_data.size(); ++i) { for (size_t i = 0; i < expect_data.size(); ++i) {
ASSERT_EQ(id_tensor.data<int64_t>()[i], ASSERT_EQ(id_tensor.data<int64_t>()[i],
......
...@@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/beam_search_op.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
framework::LoDTensor *selected_ids, framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) { framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_]; auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(); auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
auto selected_items = ToMap(items, high_level.back()); auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:"; VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) { for (size_t i = 0; i < selected_items.size(); ++i) {
...@@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
VLOG(3) << ItemToString(item); VLOG(3) << ItemToString(item);
} }
} }
PruneEndidCandidates(pre_ids, &selected_items);
PruneEndBeams(pre_ids, &selected_items);
// calculate the output tensor's height // calculate the output tensor's height
size_t num_instances = std::accumulate( size_t num_instances = std::accumulate(
std::begin(selected_items), std::end(selected_items), 0, std::begin(selected_items), std::end(selected_items), 0,
...@@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
size_t low_offset = 0; size_t low_offset = 0;
for (auto &items : selected_items) { for (auto &items : selected_items) {
low_level.push_back(low_offset); low_level.push_back(low_offset);
sort(items.begin(), items.end(), [](const Item &a, const Item &b) {
if (a.offset < b.offset) {
return true;
}
return a.id < b.id;
});
for (auto &item : items) { for (auto &item : items) {
ids_data[low_offset] = item.id; ids_data[low_offset] = item.id;
scores_data[low_offset] = item.score; scores_data[low_offset] = item.score;
...@@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
selected_scores->set_lod(lod); selected_scores->set_lod(lod);
} }
int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
std::vector<std::vector<Item>> *items) { std::vector<std::vector<Item>> *items) {
auto *pre_ids_data = pre_ids.data<int64_t>(); auto *pre_ids_data = pre_ids.data<int64_t>();
auto abs_lod = framework::ToAbsOffset(ids_->lod());
int res = 0; auto &high_level = abs_lod[lod_level_];
for (size_t offset = 0; offset < items->size(); offset++) { for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
auto prefix_id = pre_ids_data[offset]; size_t src_prefix_start = high_level[src_idx];
if (prefix_id == end_id_) { size_t src_prefix_end = high_level[src_idx + 1];
items->at(offset).clear(); bool finish_flag = true;
} else { for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
res++; for (auto &item : items->at(offset)) {
if (item.id != static_cast<size_t>(end_id_) ||
pre_ids_data[offset] != end_id_) {
finish_flag = false;
break;
}
}
if (!finish_flag) break;
}
if (finish_flag) { // all branches of the beam (source sentence) end and
// prune this beam
for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
items->at(offset).clear();
} }
} }
return res;
} }
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap( std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
...@@ -115,19 +121,17 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap( ...@@ -115,19 +121,17 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
return result; return result;
} }
std::vector<std::vector<BeamSearch::Item>> std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
BeamSearch::SelectTopBeamSizeItems() { const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores) {
std::vector<std::vector<Item>> result; std::vector<std::vector<Item>> result;
std::vector<Item> items; std::vector<Item> items;
// for each source sentence, select the top beam_size items across all // for each source sentence, select the top beam_size items across all
// candidate sets. // candidate sets.
while (NextItemSet(&items)) { while (NextItemSet(pre_ids, pre_scores, &items)) {
std::nth_element(std::begin(items), std::begin(items) + beam_size_, std::nth_element(
std::end(items), [](const Item &a, const Item &b) { std::begin(items), std::begin(items) + beam_size_, std::end(items),
// TODO(superjom) make score's comparation customizable. [](const Item &a, const Item &b) { return a.score > b.score; });
// partial sort in descending order
return a.score > b.score;
});
// prune the top beam_size items. // prune the top beam_size items.
if (items.size() > beam_size_) { if (items.size() > beam_size_) {
items.resize(beam_size_); items.resize(beam_size_);
...@@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() { ...@@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() {
} }
// the candidates of a source // the candidates of a source
bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) { bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
const framework::LoDTensor &pre_scores,
std::vector<BeamSearch::Item> *items) {
if (sent_offset_ >= ids_->NumElements(lod_level_)) { if (sent_offset_ >= ids_->NumElements(lod_level_)) {
return false; return false;
} }
...@@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) { ...@@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
instance_dim *= ids.dims()[i]; instance_dim *= ids.dims()[i];
} }
auto *pre_ids_data = pre_ids.data<int64_t>();
auto *pre_scores_data = pre_scores.data<float>();
items->clear(); items->clear();
items->reserve(framework::product(ids.dims())); items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_]; for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
for (size_t d = 0; d < instance_dim; d++) { auto pre_id = pre_ids_data[offset];
const size_t dim_offset = offset * instance_dim + d; auto pre_score = pre_scores_data[offset];
items->emplace_back(offset, ids_data[dim_offset], if (pre_id == end_id_) {
scores_data[dim_offset]); // Allocate all probability mass to eos_id for finished branches and the
// other candidate ids can be ignored.
items->emplace_back(offset, end_id_, pre_score);
} else {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
}
} }
} }
...@@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
// inputs and outputs stored in proto // inputs and outputs stored in proto
AddInput("pre_ids", "ids in previous step"); AddInput("pre_ids",
AddInput("ids", "a LoDTensor of shape of [None,k]"); "(LoDTensor) The LoDTensor containing the selected ids at the "
"previous step. It should be a tensor with shape (batch_size, 1) "
"and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
"thefirst step.");
AddInput("pre_scores",
"(LoDTensor) The LoDTensor containing the accumulated "
"scores corresponding to the selected ids at the previous step.");
AddInput("ids",
"(LoDTensor) The LoDTensor containing the candidates ids. Its "
"shape should be (batch_size * beam_size, K), where K supposed to "
"be beam_size.");
AddInput("scores", AddInput("scores",
"a LoDTensor that has the same shape and LoD with `ids`"); "(LoDTensor) The LodTensor containing the accumulated scores "
"corresponding to Input(ids) and its shape is the same as the "
"shape of Input(ids).");
AddOutput("selected_ids", AddOutput("selected_ids",
"a LoDTensor that stores the IDs selected by beam search"); "A LodTensor that stores the IDs selected by beam search.");
AddOutput( AddOutput("selected_scores",
"selected_scores", "A LoDTensor containing the accumulated scores corresponding to "
"a LoDTensor that has the same shape and LoD with `selected_ids`"); "Output(selected_ids).");
// Attributes stored in AttributeMap // Attributes stored in AttributeMap
AddAttr<int>("level", "the level of LoDTensor"); AddAttr<int>("level", "the level of LoDTensor");
...@@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int>("end_id", AddAttr<int>("end_id",
"the token id which indicates the end of a sequence"); "the token id which indicates the end of a sequence");
AddComment( AddComment(R"DOC(
"This is a beam search operator that help to generate sequences."); This operator does the search in beams for one time step.
Specifically, it selects the top-K candidate word ids of current step from
Input(ids) according to their Input(scores) for all source sentences,
where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores)
are the output of beam_search at previous step, they are needed for special use
to handle ended candidate translations. The paths linking prefixes and selected
candidates are organized and reserved in lod.
Note that the Input(scores) passed in should be accumulated scores, and
length penalty should be done with extra operators before calculating the
accumulated scores if needed; it is also suggested to find the top-K candidates
beforehand and feed only those candidates to this operator.
)DOC");
} }
}; };
...@@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { ...@@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override { framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("selected_ids")) { for (auto &o : op_desc.Output("selected_ids")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); auto &selected_ids = block->FindRecursiveOrCreateVar(o);
selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
} }
for (auto &o : op_desc.Output("selected_scores")) { for (auto &o : op_desc.Output("selected_scores")) {
block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); auto &selected_scores = block->FindRecursiveOrCreateVar(o);
selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
} }
} }
}; };
......
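One search step now also reads pre_ids/pre_scores so that a branch that already emitted end_id keeps contributing exactly one item, the end token with its frozen score, rather than being expanded or silently dropped; PruneEndBeams then clears a whole source only once every branch of it has finished. The per-source selection boils down to the following Python sketch (illustrative; the real op iterates LoD offsets):

import heapq

def beam_search_step(pre_ids, pre_scores, cand_ids, cand_scores,
                     beam_size, end_id):
    items = []                                     # (score, branch, id)
    for b, (pid, pscore) in enumerate(zip(pre_ids, pre_scores)):
        if pid == end_id:
            # Finished branch: keep only end_id with its accumulated score.
            items.append((pscore, b, end_id))
        else:
            for cid, cscore in zip(cand_ids[b], cand_scores[b]):
                items.append((cscore, b, cid))
    top = heapq.nlargest(beam_size, items)         # cf. SelectTopBeamSizeItems
    return [(b, cid, score) for score, b, cid in top]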
...@@ -132,6 +132,7 @@ class BeamSearch { ...@@ -132,6 +132,7 @@ class BeamSearch {
* that means no candidates is provided, and the task will stop running. * that means no candidates is provided, and the task will stop running.
*/ */
void operator()(const framework::LoDTensor& pre_ids, void operator()(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
framework::LoDTensor* selected_ids, framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores); framework::LoDTensor* selected_scores);
/* /*
...@@ -153,14 +154,16 @@ class BeamSearch { ...@@ -153,14 +154,16 @@ class BeamSearch {
protected: protected:
/* /*
* Delete all the records that follows the end token. * Prune the source sentences whose branches have all finished; this is optional.
* Pruning must be one step later than finishing (thus pre_ids is needed here),
* since the end tokens must be written out.
*/ */
int PruneEndidCandidates(const framework::LoDTensor& pre_ids, void PruneEndBeams(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items); std::vector<std::vector<Item>>* items);
/* /*
* Transform the items into a map whose key is offset, value is the items. * Transform the items into a map whose key is offset, value is the items.
* NOTE low performance * NOTE low performance.
*/ */
std::vector<std::vector<Item>> ToMap( std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs, size_t element_num); const std::vector<std::vector<Item>>& inputs, size_t element_num);
...@@ -168,12 +171,16 @@ class BeamSearch { ...@@ -168,12 +171,16 @@ class BeamSearch {
/* /*
* For each source, select top beam_size records. * For each source, select top beam_size records.
*/ */
std::vector<std::vector<Item>> SelectTopBeamSizeItems(); std::vector<std::vector<Item>> SelectTopBeamSizeItems(
const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores);
/* /*
* Get the items of next source sequence, return false if no remaining items. * Get the items of next source sequence, return false if no remaining items.
*/ */
bool NextItemSet(std::vector<Item>* items); bool NextItemSet(const framework::LoDTensor& pre_ids,
const framework::LoDTensor& pre_scores,
std::vector<Item>* items);
private: private:
size_t beam_size_; size_t beam_size_;
...@@ -192,24 +199,25 @@ template <typename DeviceContext, typename T> ...@@ -192,24 +199,25 @@ template <typename DeviceContext, typename T>
class BeamSearchOpKernel : public framework::OpKernel<T> { class BeamSearchOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* ids_var = context.Input<framework::LoDTensor>("ids"); auto* ids = context.Input<framework::LoDTensor>("ids");
auto* scores_var = context.Input<framework::LoDTensor>("scores"); auto* scores = context.Input<framework::LoDTensor>("scores");
auto* pre_ids_var = context.Input<framework::LoDTensor>("pre_ids"); auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
PADDLE_ENFORCE_NOT_NULL(ids_var); auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
PADDLE_ENFORCE_NOT_NULL(scores_var); PADDLE_ENFORCE_NOT_NULL(ids);
PADDLE_ENFORCE_NOT_NULL(pre_ids_var); PADDLE_ENFORCE_NOT_NULL(scores);
PADDLE_ENFORCE_NOT_NULL(pre_ids);
PADDLE_ENFORCE_NOT_NULL(pre_scores);
size_t level = context.Attr<int>("level"); size_t level = context.Attr<int>("level");
size_t beam_size = context.Attr<int>("beam_size"); size_t beam_size = context.Attr<int>("beam_size");
int end_id = context.Attr<int>("end_id"); int end_id = context.Attr<int>("end_id");
BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); BeamSearch alg(*ids, *scores, level, beam_size, end_id);
auto selected_ids_var = auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
context.Output<framework::LoDTensor>("selected_ids"); auto selected_scores =
auto selected_scores_var =
context.Output<framework::LoDTensor>("selected_scores"); context.Output<framework::LoDTensor>("selected_scores");
PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_ids);
PADDLE_ENFORCE_NOT_NULL(selected_scores_var); PADDLE_ENFORCE_NOT_NULL(selected_scores);
alg(*pre_ids_var, selected_ids_var, selected_scores_var); alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -30,7 +30,7 @@ using std::endl; ...@@ -30,7 +30,7 @@ using std::endl;
void CreateInput(LoDTensor* ids, LoDTensor* scores) { void CreateInput(LoDTensor* ids, LoDTensor* scores) {
LoD lod; LoD lod;
vector<size_t> level0({0, 1, 4}); vector<size_t> level0({0, 2, 4});
vector<size_t> level1({0, 1, 2, 3, 4}); vector<size_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0); lod.push_back(level0);
lod.push_back(level1); lod.push_back(level1);
...@@ -64,17 +64,22 @@ TEST(beam_search_op, run) { ...@@ -64,17 +64,22 @@ TEST(beam_search_op, run) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
pre_ids.mutable_data<int64_t>(place)[i] = i + 1; pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
} }
LoDTensor pre_scores;
pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
for (int i = 0; i < 4; i++) {
pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
}
BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
LoDTensor sids, sscores; LoDTensor sids, sscores;
beamsearch(pre_ids, &sids, &sscores); beamsearch(pre_ids, pre_scores, &sids, &sscores);
LOG(INFO) << "score: " << sscores << endl; LOG(INFO) << "score: " << sscores << endl;
ASSERT_EQ(sids.lod(), sscores.lod()); ASSERT_EQ(sids.lod(), sscores.lod());
vector<int> tids({2, 4, 3, 8}); vector<int> tids({4, 2, 3, 8});
vector<float> tscores({0.3, 0.5, 0.9, 0.7}); vector<float> tscores({0.5, 0.6, 0.9, 0.7});
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
ASSERT_EQ(tids[i], sids.data<int64_t>()[i]); ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/send_recv_util.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace operators {
class CheckpointNotifyOp : public framework::OperatorBase {
public:
CheckpointNotifyOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
std::string dir = Attr<std::string>("dir");
std::string lookup_table_name = Attr<std::string>("lookup_table");
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
for (size_t i = 0; i < epmap.size(); i++) {
auto lookup_table_save_dir =
string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
<< " and dir:" << dir << " to " << epmap[i];
}
rpc_client->Wait();
}
};
class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Parameter Server endpoints in the order")
.SetDefault({"127.0.0.1:6164"});
AddAttr<std::string>(
"dir", "(string, default '') indicate the folder checkpoint will use");
AddAttr<std::string>("lookup_table",
"(string, default '') the lookup table name");
AddComment(R"DOC(
CheckpointNotify operator
This operator will send the lookup table and its checkpoint directory to the listen_and_serve op at
the parameter server.
)DOC");
}
};
class CheckpointNotifyOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp,
paddle::framework::EmptyGradOpMaker,
ops::CheckpointNotifyOpMaker,
ops::CheckpointNotifyOpShapeInference);
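
For reference, a minimal standalone sketch of the per-endpoint save directory that the loop in CheckpointNotifyOp::RunImpl sends to each parameter server. It is plain C++ using snprintf in place of paddle::string::Sprintf, and the endpoints, dir and table name below are made-up values rather than real op attributes:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Hypothetical attribute values; the real op reads "epmap", "dir" and
  // "lookup_table" from its attribute map.
  std::vector<std::string> epmap = {"127.0.0.1:6164", "127.0.0.1:6165"};
  std::string dir = "/tmp/ckpt";
  std::string lookup_table_name = "embedding_table";

  // Mirror of the RunImpl loop: every parameter server gets its own
  // sub-directory "<dir>/<table>_<i>" in the checkpoint notify message.
  for (size_t i = 0; i < epmap.size(); ++i) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), "%s/%s_%zu", dir.c_str(),
                  lookup_table_name.c_str(), i);
    std::printf("notify %s -> save to %s\n", epmap[i].c_str(), buf);
  }
  return 0;
}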
...@@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( ...@@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
namespace ops = paddle::operators; namespace ops = paddle::operators;
// conv2d_transpose
REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker, ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
...@@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>); double>);
// conv3d_transpose
REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
ops::Conv3DTransposeOpMaker, ops::Conv3DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
...@@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL( ...@@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL(
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>, ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>); double>);
// depthwise conv2d_transpose
REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
ops::Conv2DTransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
double>);
...@@ -15,25 +15,28 @@ limitations under the License. */ ...@@ -15,25 +15,28 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL( // conv2d
conv2d_transpose, REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>, ops::GemmConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>); ops::GemmConvTransposeKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
conv2d_transpose_grad, ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext, ops::GemmConvTransposeGradKernel<CUDA, double>);
float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext, // conv3d
double>); REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>,
REGISTER_OP_CUDA_KERNEL( ops::GemmConvTransposeKernel<CUDA, double>);
conv3d_transpose, REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>, ops::GemmConvTransposeGradKernel<CUDA, float>,
ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>); ops::GemmConvTransposeGradKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_transpose_grad, // depthwise conv2d
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext, REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
float>, ops::DepthwiseConvTransposeKernel<CUDA, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext, ops::DepthwiseConvTransposeKernel<CUDA, double>);
double>); REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
...@@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
} }
} }
}; };
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1);
}
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
output);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
input_grad);
}
if (filter_grad) {
math::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
filter_grad);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
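
The two depthwise kernels above lean on a duality: the forward pass of a transposed convolution equals the input-gradient of an ordinary convolution, and its own input-gradient equals the ordinary forward pass, which is why they call DepthwiseConvInputGradFunctor and DepthwiseConvFunctor respectively. A minimal 1-D, single-channel sketch of that relationship (stride 1, no padding, made-up numbers, ignoring filter-flip conventions):

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};   // input of the transposed conv
  std::vector<float> k = {0.5f, -1.f};      // depthwise filter

  // Transposed conv forward: scatter x[i] * k[j] into out[i + j].
  // This is exactly the formula for d(conv)/d(input) of a "valid" conv.
  std::vector<float> out(x.size() + k.size() - 1, 0.f);
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < k.size(); ++j)
      out[i + j] += x[i] * k[j];

  // Gradient of the transposed conv w.r.t. x: gather grad_out[i + j] * k[j],
  // which is the ordinary "valid" correlation forward pass.
  std::vector<float> grad_out(out.size(), 1.f);  // pretend upstream grads
  std::vector<float> grad_x(x.size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < k.size(); ++j)
      grad_x[i] += grad_out[i + j] * k[j];

  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  for (float v : grad_x) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}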
...@@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { ...@@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
} }
}; };
template <class T>
bool DistPairDescend(std::tuple<int, int, T> pair1,
std::tuple<int, int, T> pair2) {
return std::get<2>(pair1) > std::get<2>(pair2);
}
template <typename T> template <typename T>
class BipartiteMatchKernel : public framework::OpKernel<T> { class BipartiteMatchKernel : public framework::OpKernel<T> {
public: public:
...@@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
// The match_dist must be initialized to 0 at first. // The match_dist must be initialized to 0 at first.
void BipartiteMatch(const Tensor& dist, int* match_indices, void BipartiteMatch(const Tensor& dist, int* match_indices,
T* match_dist) const { T* match_dist) const {
constexpr T kEPS = static_cast<T>(1e-6);
PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
int64_t row = dist.dims()[0]; int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1]; int64_t col = dist.dims()[1];
auto* dist_data = dist.data<T>(); auto* dist_data = dist.data<T>();
std::vector<int> row_pool; // Test result: when row == 130 the speed of these two methods is almost the same
for (int i = 0; i < row; ++i) { if (row >= 130) {
row_pool.push_back(i); std::vector<std::tuple<int, int, T>> match_pair;
}
while (row_pool.size() > 0) { for (int64_t i = 0; i < row; ++i) {
int max_idx = -1; for (int64_t j = 0; j < col; ++j) {
int max_row_idx = -1; match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j]));
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
} }
for (size_t k = 0; k < row_pool.size(); ++k) { }
int m = row_pool[k]; std::sort(match_pair.begin(), match_pair.end(), DistPairDescend<T>);
// distance is 0 between m-th row and j-th column std::vector<int> row_indices(row, -1);
if (dist_data[m * col + j] < kEPS) {
int64_t idx = 0;
for (int64_t k = 0; k < row * col; ++k) {
int64_t i = std::get<0>(match_pair[k]);
int64_t j = std::get<1>(match_pair[k]);
T dist = std::get<2>(match_pair[k]);
if (idx >= row) {
break;
}
if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) {
match_indices[j] = i;
row_indices[i] = j;
match_dist[j] = dist;
idx += 1;
}
}
} else {
constexpr T kEPS = static_cast<T>(1e-6);
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue; continue;
} }
if (dist_data[m * col + j] > max_dist) { for (size_t k = 0; k < row_pool.size(); ++k) {
max_idx = j; int m = row_pool[k];
max_row_idx = m; // distance is 0 between m-th row and j-th column
max_dist = dist_data[m * col + j]; if (dist_data[m * col + j] < kEPS) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
}
} }
} }
} if (max_idx == -1) {
if (max_idx == -1) { // Cannot find good match.
// Cannot find good match. break;
break; } else {
} else { PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); match_indices[max_idx] = max_row_idx;
match_indices[max_idx] = max_row_idx; match_dist[max_idx] = max_dist;
match_dist[max_idx] = max_dist; // Erase the row index.
// Erase the row index. row_pool.erase(
row_pool.erase( std::find(row_pool.begin(), row_pool.end(), max_row_idx));
std::find(row_pool.begin(), row_pool.end(), max_row_idx)); }
} }
} }
} }
......
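
To make the row >= 130 branch easier to follow in isolation, here is a minimal standalone sketch of the same sort-then-greedily-assign idea. The 2x3 distance matrix uses made-up values; the real kernel works on the op's dist tensor and T is a template parameter there:

#include <algorithm>
#include <cstdio>
#include <tuple>
#include <vector>

int main() {
  const int row = 2, col = 3;
  float dist[row][col] = {{0.1f, 0.7f, 0.3f},
                          {0.6f, 0.2f, 0.5f}};

  // Collect every (row, col, dist) triple and sort by distance, descending.
  std::vector<std::tuple<int, int, float>> pairs;
  for (int i = 0; i < row; ++i)
    for (int j = 0; j < col; ++j)
      pairs.emplace_back(i, j, dist[i][j]);
  std::sort(pairs.begin(), pairs.end(),
            [](const std::tuple<int, int, float>& a,
               const std::tuple<int, int, float>& b) {
              return std::get<2>(a) > std::get<2>(b);
            });

  // Greedily take pairs whose row and column are both still unmatched,
  // stopping once every row has been assigned.
  std::vector<int> match_indices(col, -1);  // column -> matched row
  std::vector<int> row_indices(row, -1);    // row -> matched column
  int assigned = 0;
  for (const auto& p : pairs) {
    if (assigned >= row) break;
    int i = std::get<0>(p), j = std::get<1>(p);
    float d = std::get<2>(p);
    if (match_indices[j] == -1 && row_indices[i] == -1 && d > 0) {
      match_indices[j] = i;
      row_indices[i] = j;
      ++assigned;
    }
  }
  for (int j = 0; j < col; ++j)
    std::printf("col %d -> row %d\n", j, match_indices[j]);
  return 0;
}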
...@@ -55,26 +55,24 @@ class BRPCClient : public RPCClient { ...@@ -55,26 +55,24 @@ class BRPCClient : public RPCClient {
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncPrefetchVar(const std::string& ep, bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& in_var_name, const std::string& in_var_name,
const std::string& out_var_name, const std::string& out_var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBatchBarrier( void AsyncSendBatchBarrier(const std::string& ep,
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
int64_t time_out = RPCClient::rpc_time_out) override;
void AsyncSendFetchBarrier( void AsyncSendFetchBarrier(const std::string& ep,
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
int64_t time_out = RPCClient::rpc_time_out) override;
void Wait() override; void Wait() override;
......
...@@ -239,6 +239,23 @@ void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { ...@@ -239,6 +239,23 @@ void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
req_count_++; req_count_++;
} }
void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir,
int64_t time_out) {
const auto ch = GetChannel(ep);
CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
void GRPCClient::Wait() { void GRPCClient::Wait() {
std::unique_lock<std::mutex> lk(sync_mutex_); std::unique_lock<std::mutex> lk(sync_mutex_);
sync_cond_.wait(lk, [this] { return req_count_ == 0; }); sync_cond_.wait(lk, [this] { return req_count_ == 0; });
...@@ -269,14 +286,15 @@ void GRPCClient::Proceed() { ...@@ -269,14 +286,15 @@ void GRPCClient::Proceed() {
} }
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// TODO(Yancey1989): make grpc client completely thread-safe
std::lock_guard<std::mutex> guard(chan_mutex_); std::lock_guard<std::mutex> guard(chan_mutex_);
auto it = channels_.find(ep); auto it = channels_.find(ep);
if (it != channels_.end()) { if (it != channels_.end()) {
return it->second; return it->second;
} }
// Channel configurations:
grpc::ChannelArguments args; grpc::ChannelArguments args;
args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
args.SetMaxSendMessageSize(std::numeric_limits<int>::max()); args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
......
...@@ -76,6 +76,7 @@ class BaseProcessor { ...@@ -76,6 +76,7 @@ class BaseProcessor {
virtual void Prepare(const VarHandle& var_info, int64_t time_out) { virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
context_.reset(new grpc::ClientContext()); context_.reset(new grpc::ClientContext());
var_h_ = var_info; var_h_ = var_info;
context_->set_wait_for_ready(true);
std::chrono::system_clock::time_point deadline = std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
...@@ -85,6 +86,7 @@ class BaseProcessor { ...@@ -85,6 +86,7 @@ class BaseProcessor {
virtual void Prepare(int64_t time_out) { virtual void Prepare(int64_t time_out) {
context_.reset(new grpc::ClientContext()); context_.reset(new grpc::ClientContext());
context_->set_wait_for_ready(true);
std::chrono::system_clock::time_point deadline = std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
...@@ -169,6 +171,20 @@ class FetchBarrierProcessor : public BaseProcessor { ...@@ -169,6 +171,20 @@ class FetchBarrierProcessor : public BaseProcessor {
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_; std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
class CheckpointNotifyProcessor : public BaseProcessor {
public:
explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~CheckpointNotifyProcessor() {}
virtual void Process() {}
sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
};
class GRPCClient : public RPCClient { class GRPCClient : public RPCClient {
public: public:
GRPCClient() {} GRPCClient() {}
...@@ -176,26 +192,27 @@ class GRPCClient : public RPCClient { ...@@ -176,26 +192,27 @@ class GRPCClient : public RPCClient {
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncPrefetchVar(const std::string& ep, bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& in_var_name, const std::string& in_var_name,
const std::string& out_var_name, const std::string& out_var_name,
int64_t time_out = RPCClient::rpc_time_out) override; int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBatchBarrier( void AsyncSendFetchBarrier(const std::string& ep,
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
int64_t time_out = RPCClient::rpc_time_out) override;
void AsyncSendFetchBarrier( void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
int64_t time_out = RPCClient::rpc_time_out) override;
void Wait() override; void Wait() override;
...@@ -211,7 +228,7 @@ class GRPCClient : public RPCClient { ...@@ -211,7 +228,7 @@ class GRPCClient : public RPCClient {
void Proceed(); void Proceed();
void AsyncSendComplete(const std::string& ep, void AsyncSendComplete(const std::string& ep,
int64_t time_out = RPCClient::rpc_time_out); int64_t time_out = FLAGS_rpc_deadline);
std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep); std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
......
...@@ -97,7 +97,7 @@ class RequestSend final : public RequestBase { ...@@ -97,7 +97,7 @@ class RequestSend final : public RequestBase {
void Process() override { void Process() override {
std::string varname = GetReqName(); std::string varname = GetReqName();
VLOG(3) << "RequestSend var_name:" << varname; VLOG(4) << "RequestSend var_name:" << varname;
auto scope = request_->GetMutableLocalScope(); auto scope = request_->GetMutableLocalScope();
auto invar = request_->GetVar(); auto invar = request_->GetVar();
...@@ -132,7 +132,7 @@ class RequestGet final : public RequestBase { ...@@ -132,7 +132,7 @@ class RequestGet final : public RequestBase {
void Process() override { void Process() override {
// proc request. // proc request.
std::string varname = request_.varname(); std::string varname = request_.varname();
VLOG(3) << "RequestGet " << varname; VLOG(4) << "RequestGet " << varname;
auto scope = request_handler_->scope(); auto scope = request_handler_->scope();
auto invar = scope->FindVar(varname); auto invar = scope->FindVar(varname);
...@@ -178,7 +178,7 @@ class RequestPrefetch final : public RequestBase { ...@@ -178,7 +178,7 @@ class RequestPrefetch final : public RequestBase {
// prefetch process... // prefetch process...
std::string in_var_name = request_->Varname(); std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname(); std::string out_var_name = request_->OutVarname();
VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name; << " out_var_name: " << out_var_name;
auto scope = request_->GetMutableLocalScope(); auto scope = request_->GetMutableLocalScope();
...@@ -200,11 +200,50 @@ class RequestPrefetch final : public RequestBase { ...@@ -200,11 +200,50 @@ class RequestPrefetch final : public RequestBase {
framework::Scope* local_scope_; framework::Scope* local_scope_;
}; };
class RequestCheckpointNotify final : public RequestBase {
public:
explicit RequestCheckpointNotify(GrpcService::AsyncService* service,
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestCheckpointNotify() {}
std::string GetReqName() override { return request_->Varname(); }
void Process() override {
auto scope = request_->GetMutableLocalScope();
std::string checkpoint_notify = request_->Varname();
std::string checkpoint_dir = request_->OutVarname();
VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
<< ", dir: " << checkpoint_dir;
request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
checkpoint_dir);
Finish(reply_, &responder_);
}
protected:
std::shared_ptr<VariableResponse> request_;
sendrecv::VoidMessage reply_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
void AsyncGRPCServer::WaitServerReady() { void AsyncGRPCServer::WaitServerReady() {
VLOG(3) << "AsyncGRPCServer is wait server ready"; VLOG(4) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_); std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
VLOG(3) << "AsyncGRPCServer WaitSeverReady"; VLOG(4) << "AsyncGRPCServer WaitSeverReady";
} }
void AsyncGRPCServer::StartServer() { void AsyncGRPCServer::StartServer() {
...@@ -237,13 +276,14 @@ void AsyncGRPCServer::StartServer() { ...@@ -237,13 +276,14 @@ void AsyncGRPCServer::StartServer() {
reqs.reserve(kRequestBufSize); reqs.reserve(kRequestBufSize);
for (int i = 0; i < kRequestBufSize; i++) { for (int i = 0; i < kRequestBufSize; i++) {
VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
TryToRegisterNewOne(rpc_name, i); TryToRegisterNewOne(rpc_name, i);
} }
for (int i = 0; i < threadnum; i++) { for (int i = 0; i < threadnum; i++) {
rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
&AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
VLOG(3) << t.first << " creates threads!"; VLOG(4) << t.first << " creates threads!";
} }
} }
...@@ -260,7 +300,7 @@ void AsyncGRPCServer::StartServer() { ...@@ -260,7 +300,7 @@ void AsyncGRPCServer::StartServer() {
auto& threads = t.second; auto& threads = t.second;
for (size_t i = 0; i < threads.size(); ++i) { for (size_t i = 0; i < threads.size(); ++i) {
threads[i]->join(); threads[i]->join();
VLOG(3) << t.first << " threads ends!"; VLOG(4) << t.first << " threads ends!";
} }
} }
} }
...@@ -268,7 +308,7 @@ void AsyncGRPCServer::StartServer() { ...@@ -268,7 +308,7 @@ void AsyncGRPCServer::StartServer() {
void AsyncGRPCServer::ShutdownQueue() { void AsyncGRPCServer::ShutdownQueue() {
for (auto& t : rpc_cq_) { for (auto& t : rpc_cq_) {
t.second->Shutdown(); t.second->Shutdown();
VLOG(3) << t.first << " shutdown!"; VLOG(4) << t.first << " queue shutdown!";
} }
} }
...@@ -277,7 +317,7 @@ void AsyncGRPCServer::ShutDownImpl() { ...@@ -277,7 +317,7 @@ void AsyncGRPCServer::ShutDownImpl() {
is_shut_down_ = true; is_shut_down_ = true;
ShutdownQueue(); ShutdownQueue();
VLOG(3) << "server_ shutdown!"; VLOG(4) << "server_ shutdown!";
server_->Shutdown(); server_->Shutdown();
} }
...@@ -285,12 +325,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, ...@@ -285,12 +325,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int req_id) { int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_); std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) { if (is_shut_down_) {
LOG(WARNING) << "shutdown, do not TryToRegisterNewSendOne"; VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
return; return;
} }
VLOG(4) << "register send rpc_name:" << rpc_name VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
<< ", handler:" << rpc_call_map_[kRequestSend]; << " REQ ID: " << req_id;
auto& reqs = rpc_reqs_[rpc_name]; auto& reqs = rpc_reqs_[rpc_name];
auto& handler = rpc_call_map_[rpc_name]; auto& handler = rpc_call_map_[rpc_name];
...@@ -303,6 +343,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, ...@@ -303,6 +343,8 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
b = new RequestGet(&service_, cq.get(), handler, req_id); b = new RequestGet(&service_, cq.get(), handler, req_id);
} else if (rpc_name == kRequestPrefetch) { } else if (rpc_name == kRequestPrefetch) {
b = new RequestPrefetch(&service_, cq.get(), handler, req_id); b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
} else if (rpc_name == kRequestCheckpoint) {
b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id);
} else { } else {
PADDLE_ENFORCE(false, "not supported rpc"); PADDLE_ENFORCE(false, "not supported rpc");
} }
...@@ -321,7 +363,7 @@ void AsyncGRPCServer::HandleRequest( ...@@ -321,7 +363,7 @@ void AsyncGRPCServer::HandleRequest(
while (true) { while (true) {
VLOG(4) << "HandleRequest " << rpc_name << " wait next"; VLOG(4) << "HandleRequest " << rpc_name << " wait next";
if (!cq->Next(&tag, &ok)) { if (!cq->Next(&tag, &ok)) {
LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
break; break;
} }
......
...@@ -80,10 +80,11 @@ enum class GrpcMethod { ...@@ -80,10 +80,11 @@ enum class GrpcMethod {
kSendVariable, kSendVariable,
kGetVariable, kGetVariable,
kPrefetchVariable, kPrefetchVariable,
kCheckpointNotify,
}; };
static const int kGrpcNumMethods = static const int kGrpcNumMethods =
static_cast<int>(GrpcMethod::kPrefetchVariable) + 1; static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
inline const char* GrpcMethodName(GrpcMethod id) { inline const char* GrpcMethodName(GrpcMethod id) {
switch (id) { switch (id) {
...@@ -93,6 +94,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { ...@@ -93,6 +94,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
return "/sendrecv.SendRecvService/GetVariable"; return "/sendrecv.SendRecvService/GetVariable";
case GrpcMethod::kPrefetchVariable: case GrpcMethod::kPrefetchVariable:
return "/sendrecv.SendRecvService/PrefetchVariable"; return "/sendrecv.SendRecvService/PrefetchVariable";
case GrpcMethod::kCheckpointNotify:
return "/sendrecv.SendRecvService/CheckpointNotify";
} }
// Shouldn't be reached. // Shouldn't be reached.
......
...@@ -36,12 +36,16 @@ namespace distributed { ...@@ -36,12 +36,16 @@ namespace distributed {
constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestSend[] = "RequestSend";
constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestGet[] = "RequestGet";
constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestPrefetch[] = "RequestPrefetch";
constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
#define COMPLETE_MESSAGE "COMPLETE@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV"
#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
class RPCServer; class RPCServer;
class RequestHandler { class RequestHandler {
...@@ -69,6 +73,11 @@ class RequestHandler { ...@@ -69,6 +73,11 @@ class RequestHandler {
prefetch_var_name_to_prepared_ctx_ = g; prefetch_var_name_to_prepared_ctx_ = g;
} }
void SetCheckpointNotifyPreparedCtx(
std::shared_ptr<framework::ExecutorPrepareContext> g) {
checkpoint_prepared_ctx_ = g;
}
// Used for async. // Used for async.
void SetGradToPreparedCtx( void SetGradToPreparedCtx(
std::unordered_map< std::unordered_map<
...@@ -115,6 +124,8 @@ class RequestHandler { ...@@ -115,6 +124,8 @@ class RequestHandler {
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>* std::shared_ptr<framework::ExecutorPrepareContext>>*
prefetch_var_name_to_prepared_ctx_; prefetch_var_name_to_prepared_ctx_;
// used for checkpoint notify
std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_prepared_ctx_;
// Used for async. // Used for async.
std::unordered_map<std::string, std::unordered_map<std::string,
......
...@@ -22,11 +22,16 @@ ...@@ -22,11 +22,16 @@
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
// to the specified directory.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
bool RequestSendHandler::Handle(const std::string& varname, bool RequestSendHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
framework::Variable* invar, framework::Variable* invar,
...@@ -119,6 +124,24 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, ...@@ -119,6 +124,24 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
return true; return true;
} }
bool RequestCheckpointHandler::Handle(const std::string& varname,
framework::Scope* scope,
framework::Variable* invar,
framework::Variable** outvar,
const std::string& out_var_name) {
PADDLE_ENFORCE(
checkpoint_notify_id != -1,
"when checkpoint_notify_id = -1, there should be no RPC invoke.");
auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
lt_var->clear();
lt_var->append(out_var_name);
VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
<< out_var_name;
executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope);
return true;
}
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -66,6 +66,21 @@ class RequestPrefetchHandler final : public RequestHandler { ...@@ -66,6 +66,21 @@ class RequestPrefetchHandler final : public RequestHandler {
const std::string& out_var_name = "") override; const std::string& out_var_name = "") override;
}; };
class RequestCheckpointHandler final : public RequestHandler {
public:
explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id)
: RequestHandler(sync_mode) {
this->checkpoint_notify_id = checkpoint_notify_id;
}
virtual ~RequestCheckpointHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const std::string& out_var_name = "") override;
private:
int checkpoint_notify_id;
};
} // namespace distributed } // namespace distributed
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,10 @@ ...@@ -13,6 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_client.h"
#include "gflags/gflags.h"
// default to 3 min to avoid temporary network failures.
DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
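
A small sketch of how a millisecond deadline flag such as FLAGS_rpc_deadline (default 180000, i.e. 3 minutes) is turned into the absolute deadline that BaseProcessor::Prepare sets on the grpc::ClientContext. The flag value is hard-coded here instead of being parsed with gflags:

#include <chrono>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t rpc_deadline_ms = 180000;  // stand-in for FLAGS_rpc_deadline

  // Same computation as Prepare(): absolute deadline = now + timeout ms.
  std::chrono::system_clock::time_point deadline =
      std::chrono::system_clock::now() +
      std::chrono::milliseconds(rpc_deadline_ms);

  auto remaining = std::chrono::duration_cast<std::chrono::seconds>(
                       deadline - std::chrono::system_clock::now())
                       .count();
  std::printf("RPC will be abandoned after ~%lld seconds\n",
              static_cast<long long>(remaining));
  return 0;
}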
...@@ -15,11 +15,14 @@ ...@@ -15,11 +15,14 @@
#pragma once #pragma once
#include <string> #include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
DECLARE_int32(rpc_deadline);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
...@@ -32,26 +35,30 @@ class RPCClient { ...@@ -32,26 +35,30 @@ class RPCClient {
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out = rpc_time_out) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual bool AsyncGetVar(const std::string& ep, virtual bool AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out = rpc_time_out) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual bool AsyncPrefetchVar(const std::string& ep, virtual bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& in_var_name, const std::string& in_var_name,
const std::string& out_var_name, const std::string& out_var_name,
int64_t time_out = rpc_time_out) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendBatchBarrier(const std::string& ep, virtual void AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = rpc_time_out) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncSendFetchBarrier(const std::string& ep, virtual void AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out = rpc_time_out) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual void AsyncCheckpointNotify(const std::string& ep,
const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) = 0;
// SendComplete tells all the server that current trainer have no more data // SendComplete tells all the server that current trainer have no more data
// to train, so that the pserver can reduce it's barrier count, and continue // to train, so that the pserver can reduce it's barrier count, and continue
...@@ -60,8 +67,6 @@ class RPCClient { ...@@ -60,8 +67,6 @@ class RPCClient {
virtual void Wait() = 0; virtual void Wait() = 0;
static constexpr int64_t rpc_time_out = 120 * 1000;
template <typename T> template <typename T>
static RPCClient* GetInstance() { static RPCClient* GetInstance() {
std::call_once(init_flag_, &RPCClient::Init<T>); std::call_once(init_flag_, &RPCClient::Init<T>);
......
...@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { ...@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
}); });
VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name]; VLOG(3) << "batch_barrier_: " << rpc_name << " "
<< barrier_counter_[rpc_name];
} }
void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
int b = 0; int b = 0;
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
b = ++barrier_counter_[rpc_name]; b = ++barrier_counter_[rpc_name];
...@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { ...@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
} }
void RPCServer::WaitCond(const std::string& rpc_name) { void RPCServer::WaitCond(const std::string& rpc_name) {
VLOG(3) << "RPCServer WaitCond " << rpc_name; VLOG(4) << "RPCServer WaitCond " << rpc_name;
int cond = 0; int cond = 0;
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
......
...@@ -25,6 +25,8 @@ service SendRecvService { ...@@ -25,6 +25,8 @@ service SendRecvService {
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// pre-fetch variable by given variable name and Ids // pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
} }
// VariableMessage is serialized paddle variable message. // VariableMessage is serialized paddle variable message.
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::reorder;
using mkldnn::primitive;
using mkldnn::stream;
using mkldnn::sum;
template <typename T>
class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
const T* x_data = x->data<T>();
const T* y_data = y->data<T>();
T* z_data = z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
auto z_dims = z->dims();
// Execute default elementwise_add operator when
// broadcast operations need to be performed.
if (x_dims != y_dims) {
auto sum_func = [](T a, T b) -> T { return a + b; };
TransformFunctor<decltype(sum_func), T,
paddle::platform::CPUDeviceContext, T>
functor(
x, y, z,
ctx.template device_context<paddle::platform::CPUDeviceContext>(),
sum_func);
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) {
functor.RunRowWise(n, pre);
} else {
functor.RunMidWise(n, pre, post);
}
z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
} else {
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
"Wrong layout/format set for X tensor");
PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
y->format() != memory::format::format_undef,
"Wrong layout/format set for X tensor");
std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
std::vector<int> dst_tz = framework::vectorize2int(z_dims);
std::vector<memory::primitive_desc> srcs_pd;
std::vector<memory> srcs;
std::vector<float> scales = {1.0f, 1.0f};
auto src_x_pd = memory::primitive_desc(
{{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
auto src_y_pd = memory::primitive_desc(
{{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
auto src_x_memory =
memory(src_x_pd, paddle::platform::to_void_cast(x_data));
auto src_y_memory =
memory(src_y_pd, paddle::platform::to_void_cast(y_data));
srcs_pd.push_back(src_x_pd);
srcs_pd.push_back(src_y_pd);
srcs.push_back(src_x_memory);
srcs.push_back(src_y_memory);
auto dst_md =
memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
// create primitive descriptor for sum
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
// create mkldnn memory for dst
memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
std::vector<primitive::at> inputs;
inputs.push_back(srcs[0]);
inputs.push_back(srcs[1]);
// create sum primitive
auto sum_prim = sum(sum_pd, inputs, dst_memory);
std::vector<primitive> pipeline;
pipeline.push_back(sum_prim);
stream(stream::kind::eager).submit(pipeline).wait();
z->set_layout(DataLayout::kMKLDNN);
z->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
}
}
};
template <typename T>
class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format());
};
if (x->dims() == y->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dx, dout);
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dy, dout);
}
} else {
// Execute default kernel when broadcast is needed
ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
IdentityGrad<T>, IdentityGrad<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
IdentityGrad<T>());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseAddMKLDNNKernel<float>)
REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseAddMKLDNNGradKernel<float>)
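
The broadcast fallback path above flattens the computation with get_mid_dims into pre/n/post loop bounds before dispatching to RunRowWise or RunMidWise. A small self-contained sketch of what that decomposition means, with made-up shapes and plain loops instead of TransformFunctor:

#include <cstdio>
#include <vector>

int main() {
  // y is broadcast onto x starting at `axis`, i.e. y's shape must match
  // x's dims [axis, axis + y.rank). Example shapes, not values from the op.
  std::vector<int> x_dims = {2, 3, 4, 5};
  std::vector<int> y_dims = {3, 4};
  int axis = 1;  // y aligns with x's dims 1..2

  int pre = 1, n = 1, post = 1;
  for (int i = 0; i < axis; ++i) pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) n *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    post *= x_dims[i];
  std::printf("pre=%d n=%d post=%d\n", pre, n, post);  // pre=2 n=12 post=5

  // z[(i*n + j)*post + k] = x[same index] + y[j]; post == 1 corresponds to
  // the RunRowWise case, post > 1 to the RunMidWise case.
  std::vector<float> x(pre * n * post, 1.f), y(n, 0.5f), z(x.size());
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k) {
        int idx = (i * n + j) * post + k;
        z[idx] = x[idx] + y[j];
      }
  std::printf("z[0]=%g z[last]=%g\n", z.front(), z.back());
  return 0;
}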
...@@ -14,8 +14,12 @@ limitations under the License. */ ...@@ -14,8 +14,12 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
#ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class ElementwiseOpInferVarType : public framework::VarTypeInference { class ElementwiseOpInferVarType : public framework::VarTypeInference {
...@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
"for broadcasting Y onto X.") "for broadcasting Y onto X.")
.SetDefault(-1) .SetDefault(-1)
.EqualGreaterThan(-1); .EqualGreaterThan(-1);
AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
Limited Elementwise %s Operator Limited Elementwise %s Operator
...@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(y_grad_name, y_dims); ctx->SetOutputDim(y_grad_name, y_dims);
} }
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
#ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -99,7 +99,8 @@ static int64_t GetTimestamp() { ...@@ -99,7 +99,8 @@ static int64_t GetTimestamp() {
void ListenAndServOp::RunSyncLoop( void ListenAndServOp::RunSyncLoop(
framework::Executor *executor, framework::ProgramDesc *program, framework::Executor *executor, framework::ProgramDesc *program,
framework::Scope *recv_scope, framework::Scope *recv_scope,
const std::vector<int> &prefetch_block_id_list) const { const std::vector<int> &prefetch_block_id_list,
const int checkpoint_point_block_id) const {
size_t num_blocks = program->Size(); size_t num_blocks = program->Size();
auto optimize_blocks = auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
...@@ -163,8 +164,8 @@ void ListenAndServOp::RunSyncLoop( ...@@ -163,8 +164,8 @@ void ListenAndServOp::RunSyncLoop(
} }
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program) const { framework::ProgramDesc *program,
VLOG(3) << "RunAsyncLoop in"; framework::Scope *recv_scope) const {
// grad name to block id // grad name to block id
std::unordered_map<std::string, int32_t> grad_to_block_id; std::unordered_map<std::string, int32_t> grad_to_block_id;
std::unordered_map<int32_t, std::string> id_to_grad; std::unordered_map<int32_t, std::string> id_to_grad;
...@@ -191,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -191,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
block_list.push_back(blkid); block_list.push_back(blkid);
} }
auto optimize_prepared = executor->Prepare(*program, block_list); auto optimize_prepared = executor->Prepare(*program, block_list);
// execute global block if needed
if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
}
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx; grad_to_prepared_ctx;
...@@ -202,10 +207,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -202,10 +207,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
VLOG(3) << "RunAsyncLoop into while";
while (true) { while (true) {
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
LOG(INFO) << "get exit!rpc_processor break!"; VLOG(4) << "get exit!rpc_processor break!";
break; break;
} }
...@@ -220,6 +224,7 @@ static void FillRequestCtx( ...@@ -220,6 +224,7 @@ static void FillRequestCtx(
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
distributed::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
h->SetDevCtx(dev_ctx); h->SetDevCtx(dev_ctx);
...@@ -227,6 +232,7 @@ static void FillRequestCtx( ...@@ -227,6 +232,7 @@ static void FillRequestCtx(
h->SetProgram(program); h->SetProgram(program);
h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetPrefetchPreparedCtx(prefetch_ctx);
h->SetRPCServer(rpc_server); h->SetRPCServer(rpc_server);
h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
} }
void ListenAndServOp::RunImpl(const framework::Scope &scope, void ListenAndServOp::RunImpl(const framework::Scope &scope,
...@@ -242,9 +248,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -242,9 +248,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
PADDLE_ENFORCE(!rpc_service_); PADDLE_ENFORCE(!rpc_service_);
std::string endpoint = Attr<std::string>("endpoint"); std::string endpoint = Attr<std::string>("endpoint");
int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
<< ", end_point:" << endpoint; << ", end_point:" << endpoint
<< ", checkpoint_block_id: " << checkpoint_block_id;
rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
...@@ -252,6 +260,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -252,6 +260,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode)); request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
request_prefetch_handler_.reset( request_prefetch_handler_.reset(
new distributed::RequestPrefetchHandler(sync_mode)); new distributed::RequestPrefetchHandler(sync_mode));
request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
sync_mode, checkpoint_block_id));
rpc_service_->RegisterRPC(distributed::kRequestSend, rpc_service_->RegisterRPC(distributed::kRequestSend,
request_send_handler_.get()); request_send_handler_.get());
...@@ -259,6 +269,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -259,6 +269,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
request_get_handler_.get()); request_get_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestPrefetch, rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
request_prefetch_handler_.get()); request_prefetch_handler_.get());
rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
request_checkpoint_handler_.get());
auto optimize_blocks = auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks); Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
...@@ -267,6 +279,13 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -267,6 +279,13 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
auto *program = optimize_blocks[0]->Program(); auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
if (checkpoint_block_id != -1) {
auto ctx = executor.Prepare(*program, checkpoint_block_id);
// see: https://stackoverflow.com/a/14856553
ckpt_pre_context = std::move(ctx);
}
// prepare for prefetch // prepare for prefetch
std::vector<int> prefetch_block_id_list; std::vector<int> prefetch_block_id_list;
std::unordered_map<int, std::string> block_id_to_prefetch_var_name; std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
...@@ -297,13 +316,15 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -297,13 +316,15 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
} }
auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, auto f =
&dev_ctx, &executor, program, std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx,
&prefetch_var_name_to_prepared_ctx, rpc_service_.get()); &executor, program, &prefetch_var_name_to_prepared_ctx,
ckpt_pre_context, rpc_service_.get());
f(request_send_handler_.get()); f(request_send_handler_.get());
f(request_get_handler_.get()); f(request_get_handler_.get());
f(request_prefetch_handler_.get()); f(request_prefetch_handler_.get());
f(request_checkpoint_handler_.get());
// start the server listening after all member initialized. // start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_)); server_thread_.reset(new std::thread(RunServer, rpc_service_));
...@@ -317,9 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -317,9 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// Write to a file of server selected port for python use. // Write to a file of server selected port for python use.
SavePort(); SavePort();
if (sync_mode) { if (sync_mode) {
RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list); RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list,
checkpoint_block_id);
} else { } else {
RunAsyncLoop(&executor, program); RunAsyncLoop(&executor, program, &recv_scope);
} }
} }
...@@ -349,6 +371,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -349,6 +371,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault({}); .SetDefault({});
AddAttr<int>("Fanin", "How many clients send to this server.") AddAttr<int>("Fanin", "How many clients send to this server.")
.SetDefault(1); .SetDefault(1);
AddAttr<int>(kCheckpointBlockId,
"BolckID to run save checkpoint on pserer.")
.SetDefault(-1);
} }
}; };
......
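The FillRequestCtx change above binds the shared context (scope, device context, executor, program, prepared prefetch and checkpoint contexts, RPC server) into one callable and then applies it to every request handler, including the new checkpoint handler. Below is a minimal sketch of that bind-then-apply pattern; Handler and Fill are made-up stand-ins, not Paddle's handler classes.

#include <functional>
#include <iostream>

// Simplified stand-in for a request handler; only what is needed to show the pattern.
struct Handler {
  void SetScope(int s) { scope = s; }
  int scope = 0;
};

// Fills one handler with the shared context (here just an int).
static void Fill(Handler* h, int scope) { h->SetScope(scope); }

int main() {
  Handler send, get, prefetch, checkpoint;
  // Bind the shared arguments once; the handler itself stays a placeholder (_1).
  auto f = std::bind(Fill, std::placeholders::_1, 42);
  for (Handler* h : {&send, &get, &prefetch, &checkpoint}) f(h);
  std::cout << checkpoint.scope << std::endl;  // prints 42
}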
...@@ -32,6 +32,7 @@ namespace operators { ...@@ -32,6 +32,7 @@ namespace operators {
constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
constexpr char kCheckpointBlockId[] = "checkpint_block_id";
void RunServer(std::shared_ptr<distributed::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
...@@ -47,10 +48,12 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -47,10 +48,12 @@ class ListenAndServOp : public framework::OperatorBase {
void RunSyncLoop(framework::Executor* executor, void RunSyncLoop(framework::Executor* executor,
framework::ProgramDesc* program, framework::ProgramDesc* program,
framework::Scope* recv_scope, framework::Scope* recv_scope,
const std::vector<int>& prefetch_block_id_list) const; const std::vector<int>& prefetch_block_id_list,
const int checkpoint_point_block_id) const;
void RunAsyncLoop(framework::Executor* executor, void RunAsyncLoop(framework::Executor* executor,
framework::ProgramDesc* program) const; framework::ProgramDesc* program,
framework::Scope* recv_scope) const;
void SavePort() const; void SavePort() const;
...@@ -67,6 +70,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -67,6 +70,8 @@ class ListenAndServOp : public framework::OperatorBase {
mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
mutable std::shared_ptr<distributed::RequestHandler> mutable std::shared_ptr<distributed::RequestHandler>
request_prefetch_handler_; request_prefetch_handler_;
mutable std::shared_ptr<distributed::RequestHandler>
request_checkpoint_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
}; };
......
...@@ -34,6 +34,8 @@ class LoadOp : public framework::OperatorBase { ...@@ -34,6 +34,8 @@ class LoadOp : public framework::OperatorBase {
auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx); platform::RecordEvent record_event(Type(), dev_ctx);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
std::ifstream fin(filename); std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op", PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
...@@ -44,9 +46,25 @@ class LoadOp : public framework::OperatorBase { ...@@ -44,9 +46,25 @@ class LoadOp : public framework::OperatorBase {
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); if (out_var->IsType<framework::LoDTensor>()) {
LoadLodTensor(fin, place, out_var);
} else if (out_var->IsType<framework::SelectedRows>()) {
LoadSelectedRows(fin, place, out_var);
} else {
PADDLE_ENFORCE(
false,
"Load only support LoDTensor and SelectedRows, %s has wrong type",
out_var_name);
}
}
DeserializeFromStream(fin, tensor, *dev_ctx); void LoadLodTensor(std::istream &fin, const platform::Place &place,
framework::Variable *var) const {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *tensor = var->GetMutable<framework::LoDTensor>();
DeserializeFromStream(fin, tensor, dev_ctx);
auto load_as_fp16 = Attr<bool>("load_as_fp16"); auto load_as_fp16 = Attr<bool>("load_as_fp16");
auto in_dtype = framework::ToDataType(tensor->type()); auto in_dtype = framework::ToDataType(tensor->type());
...@@ -63,18 +81,27 @@ class LoadOp : public framework::OperatorBase { ...@@ -63,18 +81,27 @@ class LoadOp : public framework::OperatorBase {
&fp16_tensor); &fp16_tensor);
// reset output tensor // reset output tensor
out_var->Clear(); var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = var->GetMutable<framework::LoDTensor>();
tensor->set_lod(fp16_tensor.lod()); tensor->set_lod(fp16_tensor.lod());
tensor->ShareDataWith(fp16_tensor); tensor->ShareDataWith(fp16_tensor);
} }
} }
void LoadSelectedRows(std::istream &fin, const platform::Place &place,
framework::Variable *var) const {
auto *selectedRows = var->GetMutable<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
}
}; };
class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddOutput("Out", "The tensor need to be loaded"); AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded");
AddAttr<bool>( AddAttr<bool>(
"load_as_fp16", "load_as_fp16",
"If true, the tensor will be first loaded and then " "If true, the tensor will be first loaded and then "
...@@ -85,7 +112,9 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -85,7 +112,9 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
R"(Variable will be loaded from "file_path")") R"(Variable will be loaded from "file_path")")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string &path) { return !path.empty(); });
AddComment("Load operator will load a tensor variable from disk file."); AddComment(
"Load operator will load a LoDTensor / SelectedRows variable from disk "
"file.");
} }
}; };
} // namespace operators } // namespace operators
......
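LoadOp now branches on the runtime type of the output variable and keeps separate deserialization paths for LoDTensor and SelectedRows, failing loudly on anything else. A small C++17 sketch of the same dispatch shape, using empty stand-in types instead of the framework classes:

#include <iostream>
#include <stdexcept>
#include <variant>

struct LoDTensor {};
struct SelectedRows {};
struct Other {};  // anything the op refuses to load
using Variable = std::variant<LoDTensor, SelectedRows, Other>;

void Load(const Variable& var) {
  if (std::holds_alternative<LoDTensor>(var)) {
    std::cout << "deserialize as LoDTensor" << std::endl;
  } else if (std::holds_alternative<SelectedRows>(var)) {
    std::cout << "deserialize as SelectedRows" << std::endl;
  } else {
    throw std::runtime_error("Load only supports LoDTensor and SelectedRows");
  }
}

int main() {
  Load(Variable{SelectedRows{}});  // prints "deserialize as SelectedRows"
}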
...@@ -18,10 +18,7 @@ ...@@ -18,10 +18,7 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include "paddle/fluid/platform/dynload/mklml.h"
#include <mkl_lapacke.h>
#include <mkl_service.h>
#include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
...@@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) { ...@@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) {
openblas_set_num_threads(real_num_threads); openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML) #elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1; int real_num_threads = num_threads > 1 ? num_threads : 1;
mkl_set_num_threads(real_num_threads); platform::dynload::MKL_Set_Num_Threads(real_num_threads);
#else #else
PADDLE_ENFORCE(false, "To be implemented."); PADDLE_ENFORCE(false, "To be implemented.");
#endif #endif
......
...@@ -22,61 +22,109 @@ namespace math { ...@@ -22,61 +22,109 @@ namespace math {
template <typename T> template <typename T>
struct CBlas; struct CBlas;
#ifdef PADDLE_WITH_MKLML
template <> template <>
struct CBlas<float> { struct CBlas<float> {
template <typename... ARGS> template <typename... ARGS>
static void GEMM(ARGS... args) { static void GEMM(ARGS... args) {
cblas_sgemm(args...); platform::dynload::cblas_sgemm(args...);
} }
template <typename... ARGS> template <typename... ARGS>
static void AXPY(ARGS... args) { static void AXPY(ARGS... args) {
cblas_saxpy(args...); platform::dynload::cblas_saxpy(args...);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_scopy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
platform::dynload::cblas_sgemv(args...);
}
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
platform::dynload::cblas_sgemm_batch(args...);
} }
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS> template <typename... ARGS>
static void VADD(ARGS... args) { static void VADD(ARGS... args) {
vsAdd(args...); platform::dynload::vsAdd(args...);
}
};
template <>
struct CBlas<double> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
platform::dynload::cblas_dgemm(args...);
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
platform::dynload::cblas_daxpy(args...);
} }
#endif
template <typename... ARGS> template <typename... ARGS>
static void VCOPY(ARGS... args) { static void VCOPY(ARGS... args) {
cblas_scopy(args...); platform::dynload::cblas_dcopy(args...);
} }
template <typename... ARGS> template <typename... ARGS>
static void GEMV(ARGS... args) { static void GEMV(ARGS... args) {
cblas_sgemv(args...); platform::dynload::cblas_dgemv(args...);
} }
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS> template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) { static void GEMM_BATCH(ARGS... args) {
cblas_sgemm_batch(args...); platform::dynload::cblas_dgemm_batch(args...);
}
template <typename... ARGS>
static void VADD(ARGS... args) {
platform::dynload::vdAdd(args...);
} }
#endif
}; };
#else
template <> template <>
struct CBlas<double> { struct CBlas<float> {
template <typename... ARGS> template <typename... ARGS>
static void GEMM(ARGS... args) { static void GEMM(ARGS... args) {
cblas_dgemm(args...); cblas_sgemm(args...);
} }
template <typename... ARGS> template <typename... ARGS>
static void AXPY(ARGS... args) { static void AXPY(ARGS... args) {
cblas_daxpy(args...); cblas_saxpy(args...);
} }
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS> template <typename... ARGS>
static void VADD(ARGS... args) { static void VCOPY(ARGS... args) {
vdAdd(args...); cblas_scopy(args...);
}
template <typename... ARGS>
static void GEMV(ARGS... args) {
cblas_sgemv(args...);
}
};
template <>
struct CBlas<double> {
template <typename... ARGS>
static void GEMM(ARGS... args) {
cblas_dgemm(args...);
}
template <typename... ARGS>
static void AXPY(ARGS... args) {
cblas_daxpy(args...);
} }
#endif
template <typename... ARGS> template <typename... ARGS>
static void VCOPY(ARGS... args) { static void VCOPY(ARGS... args) {
...@@ -87,15 +135,8 @@ struct CBlas<double> { ...@@ -87,15 +135,8 @@ struct CBlas<double> {
static void GEMV(ARGS... args) { static void GEMV(ARGS... args) {
cblas_dgemv(args...); cblas_dgemv(args...);
} }
#ifdef PADDLE_WITH_MKLML
template <typename... ARGS>
static void GEMM_BATCH(ARGS... args) {
cblas_dgemm_batch(args...);
}
#endif
}; };
#endif
template <> template <>
struct CBlas<platform::float16> { struct CBlas<platform::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
......
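Both the MKLML and the OpenBLAS specializations above rely on variadic member templates, so one wrapper signature can forward any cblas argument list unchanged to the selected backend. A toy version of the trick, with a hypothetical backend_axpy standing in for the real BLAS call:

#include <cstdio>

// Hypothetical backend; real code would forward to cblas_saxpy or its dynload wrapper.
static void backend_axpy(int n, float a, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] += a * x[i];
}

template <typename T>
struct CBlasLike;

template <>
struct CBlasLike<float> {
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    backend_axpy(args...);  // whatever the caller passed goes straight through
  }
};

int main() {
  float x[2] = {1.f, 2.f}, y[2] = {0.f, 0.f};
  CBlasLike<float>::AXPY(2, 0.5f, x, y);
  std::printf("%.1f %.1f\n", y[0], y[1]);  // 0.5 1.0
}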
...@@ -14,9 +14,7 @@ limitations under the License. */ ...@@ -14,9 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h> #include "paddle/fluid/platform/dynload/mklml.h"
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
......
...@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("SeedOut", "The random seed after random cropping.") AddOutput("SeedOut", "The random seed after random cropping.")
.AsIntermediate(); .AsIntermediate();
AddAttr<std::vector<int>>("shape", "The shape of a cropped instance."); AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
AddAttr<int>("startup_seed",
"If the input 'Seed' is not initialized, the 'startup_seed' "
"will be used to replace it. Even so, the seed after random "
"crop will also be outputed to the 'SeedOut'.")
.SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
This operator takes a batch of instances, and does random cropping on each instance. This operator takes a batch of instances, and does random cropping on each instance.
It means that cropping positions differ on each instance, which is determined It means that cropping positions differ on each instance, which is determined
...@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
class RandomCropOpInferShape : public framework::InferShapeBase { class RandomCropOpInferShape : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext* ctx) const override { void operator()(framework::InferShapeContext* ctx) const override {
auto seed_dim = ctx->GetInputDim("Seed");
PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
auto shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size())); PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
...@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase { ...@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
out_dim[x_i] = shape[shape_i]; out_dim[x_i] = shape[shape_i];
} }
ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
} }
}; };
......
...@@ -142,16 +142,22 @@ template <typename DeviceContext, typename T> ...@@ -142,16 +142,22 @@ template <typename DeviceContext, typename T>
class RandomCropKernel : public framework::OpKernel<T> { class RandomCropKernel : public framework::OpKernel<T> {
public: public:
virtual void Compute(const framework::ExecutionContext& ctx) const { virtual void Compute(const framework::ExecutionContext& ctx) const {
auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
int64_t seed = 0; int64_t seed = 0;
if (platform::is_cpu_place(seed_tensor.place())) { auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
seed = *seed_tensor.data<int64_t>(); if (seed_tensor.IsInitialized()) {
if (platform::is_cpu_place(seed_tensor.place())) {
seed = *seed_tensor.data<int64_t>();
} else {
LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
"your program";
framework::LoDTensor cpu_seed;
framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
seed = *cpu_seed.data<int64_t>();
}
} else { } else {
LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
"your program"; "'startup_seed' instead.";
framework::LoDTensor cpu_seed; seed = ctx.Attr<int>("startup_seed");
framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
seed = *cpu_seed.data<int64_t>();
} }
auto shape = ctx.Attr<std::vector<int>>("shape"); auto shape = ctx.Attr<std::vector<int>>("shape");
auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X")); auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
...@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> { ...@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> {
engine.discard(functor.prod_batchsize_dims_ * engine.discard(functor.prod_batchsize_dims_ *
(functor.rank_ - functor.num_batchsize_dims_)); (functor.rank_ - functor.num_batchsize_dims_));
*ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>( *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
platform::CPUPlace()) = engine(); framework::make_ddim({1}), platform::CPUPlace()) = engine();
} }
}; };
......
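With the startup_seed attribute in place, the kernel can run even when the 'Seed' tensor was never fed. The sketch below illustrates that control flow with an invented NextSeed helper: prefer the seed tensor when it is initialized, otherwise fall back to the attribute, then advance the engine and hand the next seed to 'SeedOut'. It is an illustration of the logic, not the operator's exact code.

#include <iostream>
#include <random>

int64_t NextSeed(const int64_t* seed_tensor, int64_t startup_seed, size_t draws) {
  // seed_tensor == nullptr models an uninitialized 'Seed' input.
  int64_t seed = seed_tensor != nullptr ? *seed_tensor : startup_seed;
  std::mt19937_64 engine(static_cast<uint64_t>(seed));
  engine.discard(draws);  // skip the numbers consumed by this batch's crops
  return static_cast<int64_t>(engine());  // value written to 'SeedOut' for the next batch
}

int main() {
  std::cout << NextSeed(nullptr, /*startup_seed=*/0, /*draws=*/8) << std::endl;
}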
...@@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o ...@@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
# Export local libraries to parent # Export local libraries to parent
......
...@@ -88,24 +88,29 @@ class BlockingQueue { ...@@ -88,24 +88,29 @@ class BlockingQueue {
receive_cv_.notify_all(); receive_cv_.notify_all();
} }
bool IsClosed() { bool IsClosed() const {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return closed_; return closed_;
} }
size_t Cap() { size_t Cap() const {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
return capacity_; return capacity_;
} }
size_t Size() const {
std::lock_guard<std::mutex> lock(mutex_);
return queue_.size();
}
private: private:
size_t capacity_; size_t capacity_;
bool closed_; bool closed_;
std::deque<T> queue_; std::deque<T> queue_;
std::mutex mutex_; mutable std::mutex mutex_;
std::condition_variable receive_cv_; mutable std::condition_variable receive_cv_;
std::condition_variable send_cv_; mutable std::condition_variable send_cv_;
}; };
} // namespace reader } // namespace reader
} // namespace operators } // namespace operators
......
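Making Cap(), Size() and IsClosed() const is what forces the mutex and condition variables above to become mutable: locking still mutates them inside a const member function. A minimal illustration of that idiom (not the Paddle class):

#include <iostream>
#include <mutex>

class Counter {
 public:
  void Add() {
    std::lock_guard<std::mutex> lock(mutex_);
    ++n_;
  }
  int Get() const {  // logically const for callers ...
    std::lock_guard<std::mutex> lock(mutex_);  // ... but locking mutates mutex_, hence 'mutable'
    return n_;
  }

 private:
  int n_ = 0;
  mutable std::mutex mutex_;
};

int main() {
  Counter c;
  c.Add();
  std::cout << c.Get() << std::endl;  // prints 1
}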
...@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader { ...@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader {
const framework::ProgramDesc program_; const framework::ProgramDesc program_;
int sub_block_id_; int sub_block_id_;
framework::Executor exe_; framework::Executor exe_;
framework::Scope scope_;
std::vector<std::string> source_var_names_; std::vector<std::string> source_var_names_;
std::vector<std::string> sink_var_names_; std::vector<std::string> sink_var_names_;
...@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) { ...@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
// The scope for CustomReader's sub-block should be independent and shouldn't // The scope for CustomReader's sub-block should be independent and shouldn't
// be any other computation scope's child. Otherwise, data preprocessing and // be any other computation scope's child. Otherwise, data preprocessing and
// computation cannot be concurrent. // computation cannot be concurrent.
framework::Scope scope; framework::Scope* exe_scope = &scope_.NewScope();
// 1. Copy LoDTensors from underlying reader's output to source variables. // 1. Copy LoDTensors from underlying reader's output to source variables.
for (size_t i = 0; i < source_var_names_.size(); ++i) { for (size_t i = 0; i < source_var_names_.size(); ++i) {
framework::Variable* var = scope.Var(source_var_names_[i]); framework::Variable* var = exe_scope->Var(source_var_names_[i]);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>(); framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(underlying_outs[i]); tensor->ShareDataWith(underlying_outs[i]);
tensor->set_lod(underlying_outs[i].lod()); tensor->set_lod(underlying_outs[i].lod());
} }
// 2. Run the sub-block. // 2. Run the sub-block.
exe_.Run(program_, &scope, sub_block_id_, false, true); exe_.Run(program_, exe_scope, sub_block_id_, false, true);
// 3. Copy LoDTensors from sink variables to out. // 3. Copy LoDTensors from sink variables to out.
out->resize(sink_var_names_.size()); out->resize(sink_var_names_.size());
for (size_t i = 0; i < sink_var_names_.size(); ++i) { for (size_t i = 0; i < sink_var_names_.size(); ++i) {
const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i])) const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
.Get<framework::LoDTensor>(); .Get<framework::LoDTensor>();
framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
} }
scope_.DeleteScope(exe_scope);
} }
} // namespace reader } // namespace reader
......
...@@ -23,13 +23,13 @@ namespace reader { ...@@ -23,13 +23,13 @@ namespace reader {
// 'Double buffer' means we shall maintain two batches of input data at the same // 'Double buffer' means we shall maintain two batches of input data at the same
// time. So the kCacheSize should be at least 2. // time. So the kCacheSize should be at least 2.
static constexpr size_t kCacheSize = 3; static constexpr size_t kCacheSize = 5;
// There will be two batches out of the channel during training: // There will be two batches out of the channel during training:
// 1. the one waiting to be sent to the channel // 1. the one waiting to be sent to the channel
// 2. the one just be received from the channel, which is also being used by // 2. the one just be received from the channel, which is also being used by
// subsequent operators. // subsequent operators.
// So the channel size should be kCacheSize - 2 // So the channel size should be kCacheSize - 2
static constexpr size_t kChannelSize = 1; // kCacheSize - 2 static constexpr size_t kChannelSize = 3; // kCacheSize - 2
class DoubleBufferReader : public framework::DecoratedReader { class DoubleBufferReader : public framework::DecoratedReader {
public: public:
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class PyReader : public framework::ReaderBase {
public:
explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue) {
PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
queue_ = queue;
}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
bool success;
*out = queue_->Pop(&success);
if (!success) out->clear();
}
void ReInit() override {}
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
class CreatePyReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) return;
const std::string& queue_name = Input("blocking_queue");
auto* queue_holder_var = scope.FindVar(queue_name);
PADDLE_ENFORCE(
queue_holder_var != nullptr,
"No LoDTensorBlockingQueueHolder variable with name %s found",
queue_name);
auto* queue_holder =
queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
out->Reset(new PyReader(queue_holder->GetQueue()));
}
};
class CreatePyReaderOpMaker : public FileReaderMakerBase {
protected:
void Apply() override {
AddInput("blocking_queue",
"Name of the `LoDTensorBlockingQueueHolder` variable");
AddComment(R"DOC(
Create PyReader to support LoDTensor data feeding in Python side.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = ::paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp,
reader::CreatePyReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
namespace reader {
class LoDTensorBlockingQueueHolder;
class LoDTensorBlockingQueue {
friend class LoDTensorBlockingQueueHolder;
private:
LoDTensorBlockingQueue(size_t capacity,
const std::vector<framework::DDim>& dims)
: queue_(capacity), dims_(dims) {}
public:
bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(lod_tensor_vec);
}
bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
CheckDims(lod_tensor_vec);
return queue_.Send(std::move(lod_tensor_vec));
}
std::vector<framework::LoDTensor> Pop(bool* ok = nullptr) {
std::vector<framework::LoDTensor> lod_tensor_vec;
bool success = queue_.Receive(&lod_tensor_vec);
if (ok != nullptr) *ok = success;
return lod_tensor_vec;
}
inline size_t Cap() const { return queue_.Cap(); }
inline size_t Size() const { return queue_.Size(); }
inline void Close() { return queue_.Close(); }
inline bool IsClosed() const { return queue_.IsClosed(); }
private:
void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
"Expect input size is %d but found %s", dims_.size(),
lod_tensor_vec.size());
for (size_t i = 0; i < dims_.size(); ++i) {
const auto& in_dims = framework::slice_ddim(
lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
const auto& expect_dims =
framework::slice_ddim(dims_[i], 1, dims_[i].size());
PADDLE_ENFORCE(in_dims == expect_dims,
"Dims of the %d-th input tensor do not match", i);
}
}
BlockingQueue<std::vector<framework::LoDTensor>> queue_;
std::vector<framework::DDim> dims_;
};
class LoDTensorBlockingQueueHolder {
public:
void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
PADDLE_ENFORCE(
queue_ == nullptr,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
}
inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
return queue_;
}
private:
std::shared_ptr<LoDTensorBlockingQueue> queue_;
};
} // namespace reader
} // namespace operators
} // namespace paddle
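The holder exists so that the queue is built exactly once (from the Python side via init_lod_tensor_blocking_queue) and every later user only sees the shared_ptr. A stripped-down sketch of the same pattern, with plain ints instead of LoDTensor batches and without the blocking or dim-check logic:

#include <deque>
#include <iostream>
#include <memory>
#include <stdexcept>

class SimpleQueue {
 public:
  explicit SimpleQueue(size_t capacity) : capacity_(capacity) {}
  bool Push(int v) {
    if (queue_.size() >= capacity_) return false;  // the real queue blocks instead
    queue_.push_back(v);
    return true;
  }
  size_t Size() const { return queue_.size(); }

 private:
  size_t capacity_;
  std::deque<int> queue_;
};

class SimpleQueueHolder {
 public:
  void InitOnce(size_t capacity) {
    if (queue_ != nullptr) throw std::runtime_error("InitOnce() can only be called once");
    queue_.reset(new SimpleQueue(capacity));
  }
  const std::shared_ptr<SimpleQueue>& GetQueue() const { return queue_; }

 private:
  std::shared_ptr<SimpleQueue> queue_;
};

int main() {
  SimpleQueueHolder holder;
  holder.InitOnce(2);
  holder.GetQueue()->Push(1);
  std::cout << holder.GetQueue()->Size() << std::endl;  // prints 1
}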
...@@ -139,6 +139,7 @@ TEST(LoadFP16Op, CPU) { ...@@ -139,6 +139,7 @@ TEST(LoadFP16Op, CPU) {
save_op->Run(scope, place); save_op->Run(scope, place);
auto load_var = scope.Var("out_var"); auto load_var = scope.Var("out_var");
load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp( auto load_op = paddle::framework::OpRegistry::CreateOp(
"load", {}, {{"Out", {"out_var"}}}, attrs); "load", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place); load_op->Run(scope, place);
......
...@@ -22,11 +22,17 @@ limitations under the License. */ ...@@ -22,11 +22,17 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
// to the specified directory.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
// TODO(yuyang18): If the functions below are needed by other files, move them // TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace. // to paddle::filesystem namespace.
constexpr char kSEP = '/'; constexpr char kSEP = '/';
...@@ -67,9 +73,27 @@ class SaveOp : public framework::OperatorBase { ...@@ -67,9 +73,27 @@ class SaveOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
auto iname = Input("X");
auto *var = scope.FindVar(iname);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
iname);
if (var->IsType<framework::LoDTensor>()) {
SaveLodTensor(place, var);
} else if (var->IsType<framework::SelectedRows>()) {
SaveSelectedRows(scope, place, var);
} else {
PADDLE_ENFORCE(
false,
"SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
iname);
}
}
void SaveLodTensor(const platform::Place &place,
framework::Variable *var) const {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite"); auto overwrite = Attr<bool>("overwrite");
auto save_as_fp16 = Attr<bool>("save_as_fp16");
if (FileExists(filename) && !overwrite) { if (FileExists(filename) && !overwrite) {
PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
...@@ -78,26 +102,19 @@ class SaveOp : public framework::OperatorBase { ...@@ -78,26 +102,19 @@ class SaveOp : public framework::OperatorBase {
MkDirRecursively(DirName(filename).c_str()); MkDirRecursively(DirName(filename).c_str());
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto iname = Input("X");
auto *var = scope.FindVar(iname);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
iname);
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"SaveOp only support LoDTensor, %s has wrong type", iname);
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto save_as_fp16 = Attr<bool>("save_as_fp16");
auto in_dtype = framework::ToDataType(tensor.type()); auto in_dtype = framework::ToDataType(tensor.type());
auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
...@@ -112,17 +129,43 @@ class SaveOp : public framework::OperatorBase { ...@@ -112,17 +129,43 @@ class SaveOp : public framework::OperatorBase {
} else { } else {
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
} }
fout.close();
}
void SaveSelectedRows(const framework::Scope &scope,
const platform::Place &place,
framework::Variable *var) const {
auto *lt_var_ptr = scope.FindVar(LOOKUP_TABLE_PATH);
PADDLE_ENFORCE(
lt_var_ptr != nullptr,
"Cannot find variable kLookupTablePath for SaveSelectedRows");
auto *lt_var = lt_var_ptr->GetMutable<std::string>();
std::string filename = lt_var->data();
VLOG(4) << "SaveSelectedRows get File name: " << filename;
auto &selectedRows = var->Get<framework::SelectedRows>();
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// FIXME(yuyang18): We save variable to local file now, but we should change
// it to save an output stream.
std::ofstream fout(filename);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
framework::SerializeToStream(fout, selectedRows, dev_ctx);
fout.close();
} }
}; };
class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "(Tensor ) Input tensor to be saved"); AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved");
AddComment(R"DOC( AddComment(R"DOC(
Save operator Save operator
This operator will serialize and write a tensor variable to file on disk. This operator will serialize and write a LoDTensor / SelectedRows variable to a file on disk.
)DOC"); )DOC");
AddAttr<bool>("overwrite", AddAttr<bool>("overwrite",
"(boolean, default true)" "(boolean, default true)"
...@@ -142,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk. ...@@ -142,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk.
} }
}; };
class SaveOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front();
auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SaveOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker,
ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference,
ops::SaveOpShapeInference);
...@@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> { ...@@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
const framework::Vector<size_t>& x_lod, /*expand source lod*/ const framework::Vector<size_t>& x_lod, /*expand source lod*/
const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/ const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
LoDTensor* dx) { LoDTensor* dx) {
math::SetConstant<platform::CPUDeviceContext, T> set_zero;
set_zero(context, dx, static_cast<T>(0));
int dout_offset = 0; int dout_offset = 0;
for (size_t i = 1; i < ref_lod.size(); ++i) { for (size_t i = 1; i < ref_lod.size(); ++i) {
int repeat_num = ref_lod[i] - ref_lod[i - 1]; int repeat_num = ref_lod[i] - ref_lod[i - 1];
...@@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> { ...@@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
g_x->mutable_data<T>(context.GetPlace()); g_x->mutable_data<T>(context.GetPlace());
g_x->set_lod(x->lod()); g_x->set_lod(x->lod());
auto& dev_ctx = context.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, g_x, static_cast<T>(0));
auto& y_lod = y->lod(); auto& y_lod = y->lod();
if (ref_level == -1) ref_level = y_lod.size() - 1; if (ref_level == -1) ref_level = y_lod.size() - 1;
// just copy the gradient // just copy the gradient
......
...@@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { ...@@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp {
<< " to " << offset + 1; << " to " << offset + 1;
out->resize(offset + 1); out->resize(offset + 1);
} }
auto *out_tensor = &out->at(offset);
out_tensor->set_lod(x_tensor.lod());
if (x_tensor.memory_size() > 0) { if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset);
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
TensorCopy(x_tensor, place, dev_ctx, out_tensor); TensorCopy(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod());
} else { } else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array[" "nothing has been written to output array["
......
...@@ -53,6 +53,7 @@ template <typename DeviceContext, typename T> ...@@ -53,6 +53,7 @@ template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> { class TensorRTEngineKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
VLOG(4) << "TensorRTEngineKernel executing";
auto engine_name = context.Attr<std::string>("engine_uniq_key"); auto engine_name = context.Attr<std::string>("engine_uniq_key");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) { if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context); Prepare(context);
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
...@@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, ...@@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
*var = *desc.Proto(); *var = *desc.Proto();
} }
template <typename T>
void SetAttr(framework::proto::OpDesc* op, const std::string& name,
const T& data);
template <>
void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
const std::string& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRING);
attr->set_s(data);
}
template <>
void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
const int& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
const int64_t& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::LONG);
attr->set_l(data);
}
template <>
void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
const std::string& name,
const std::vector<std::string>& data) {
auto* attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::STRINGS);
for (const auto& s : data) {
attr->add_strings(s.c_str());
}
}
} // namespace } // namespace
using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) { TEST(TensorRTEngineOp, manual) {
framework::ProgramDesc program; framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
......
...@@ -17,3 +17,7 @@ if (CUPTI_FOUND) ...@@ -17,3 +17,7 @@ if (CUPTI_FOUND)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
if (WITH_MKLML)
cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
endif()
# TODO(TJ): add iomp, mkldnn?
...@@ -49,6 +49,8 @@ DEFINE_string( ...@@ -49,6 +49,8 @@ DEFINE_string(
tensorrt_dir, "", tensorrt_dir, "",
"Specify path for loading tensorrt library, such as libnvinfer.so."); "Specify path for loading tensorrt library, such as libnvinfer.so.");
DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
...@@ -76,6 +78,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, ...@@ -76,6 +78,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
VLOG(3) << "Try to find library: " << dso_path VLOG(3) << "Try to find library: " << dso_path
<< " from default system path."; << " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
...@@ -97,6 +100,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, ...@@ -97,6 +100,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
} }
#endif #endif
if (nullptr == dso_handle) {
LOG(WARNING) << "Can not find library: " << dso_path
<< ". Please try to add the lib path to LD_LIBRARY_PATH.";
}
return dso_handle; return dso_handle;
} }
...@@ -206,6 +213,14 @@ void* GetTensorRtDsoHandle() { ...@@ -206,6 +213,14 @@ void* GetTensorRtDsoHandle() {
#endif #endif
} }
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
#else
return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
#endif
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle(); ...@@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle();
void* GetLapackDsoHandle(); void* GetLapackDsoHandle();
void* GetNCCLDsoHandle(); void* GetNCCLDsoHandle();
void* GetTensorRtDsoHandle(); void* GetTensorRtDsoHandle();
void* GetMKLMLDsoHandle();
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/mklml.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag mklml_dso_flag;
void* mklml_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
MKLML_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <dlfcn.h>
#include <mkl.h>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag mklml_dso_flag;
extern void* mklml_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamically load mklml routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
}); \
static void* p_##_name = dlsym(mklml_dso_handle, #__name); \
return reinterpret_cast<mklmlFunc>(p_##_name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
#define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \
__macro(cblas_saxpy); \
__macro(cblas_scopy); \
__macro(cblas_sgemv); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm); \
__macro(cblas_daxpy); \
__macro(cblas_dcopy); \
__macro(cblas_dgemv); \
__macro(cblas_dgemm_batch); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
#undef DYNAMIC_LOAD_MKLML_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
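Each DYNAMIC_LOAD_MKLML_WRAP expansion lazily opens the library once (guarded by std::call_once) and caches the dlsym() result in a function-local static, so later calls pay only an indirect call. A condensed sketch of that pattern against plain libm, so the library and symbol names here are only illustrative; link with -ldl.

#include <dlfcn.h>
#include <iostream>
#include <mutex>
#include <stdexcept>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;

// Resolve "cos" from libm the first time it is needed, then reuse the cached pointer.
double dyn_cos(double x) {
  std::call_once(dso_flag, [] { dso_handle = dlopen("libm.so.6", RTLD_LAZY); });
  if (dso_handle == nullptr) throw std::runtime_error("cannot dlopen libm");
  using CosFunc = double (*)(double);
  static void* p_cos = dlsym(dso_handle, "cos");
  return reinterpret_cast<CosFunc>(p_cos)(x);
}

int main() { std::cout << dyn_cos(0.0) << std::endl; }  // prints 1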
...@@ -228,7 +228,7 @@ class MKLDNNHandler { ...@@ -228,7 +228,7 @@ class MKLDNNHandler {
return dstr; return dstr;
}; };
return dims2str(operand_dims) + suffix; return dims2str(operand_dims) + suffix;
}; }
protected: protected:
const MKLDNNDeviceContext& dev_ctx_; const MKLDNNDeviceContext& dev_ctx_;
...@@ -237,5 +237,15 @@ class MKLDNNHandler { ...@@ -237,5 +237,15 @@ class MKLDNNHandler {
bool is_reusing_; bool is_reusing_;
}; };
inline mkldnn::memory::format MKLDNNFormatForSize(
size_t dims_size, mkldnn::memory::format data_format) {
if (dims_size == 1) {
return mkldnn::memory::format::x;
} else if (dims_size == 2) {
return mkldnn::memory::format::nc;
}
return data_format;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -34,6 +34,7 @@ limitations under the License. */ ...@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -297,6 +298,37 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -297,6 +298,37 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("reset", &framework::ReaderHolder::ReInit); .def("reset", &framework::ReaderHolder::ReInit);
using LoDTensorBlockingQueue =
::paddle::operators::reader::LoDTensorBlockingQueue;
using LoDTensorBlockingQueueHolder =
::paddle::operators::reader::LoDTensorBlockingQueueHolder;
py::class_<LoDTensorBlockingQueue>(m, "LoDTensorBlockingQueue", "")
.def("push",
[](LoDTensorBlockingQueue &self,
const std::vector<framework::LoDTensor> &lod_tensor_vec) {
pybind11::gil_scoped_release release;
return self.Push(lod_tensor_vec);
})
.def("size", &LoDTensorBlockingQueue::Size)
.def("capacity", &LoDTensorBlockingQueue::Cap)
.def("close", &LoDTensorBlockingQueue::Close)
.def("is_closed", &LoDTensorBlockingQueue::IsClosed);
m.def("init_lod_tensor_blocking_queue",
[](Variable &var, size_t capacity,
const std::vector<std::vector<int64_t>> &shapes)
-> LoDTensorBlockingQueue * {
std::vector<DDim> dims(shapes.size());
std::transform(shapes.begin(), shapes.end(), dims.begin(),
[](const std::vector<int64_t> &shape) {
return make_ddim(shape);
});
auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
holder->InitOnce(capacity, dims);
return holder->GetQueue().get();
},
py::return_value_policy::reference);
py::class_<Scope>(m, "Scope", "") py::class_<Scope>(m, "Scope", "")
.def("var", .def("var",
[](Scope &self, const std::string &name) -> Variable * { [](Scope &self, const std::string &name) -> Variable * {
...@@ -463,9 +495,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -463,9 +495,11 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
.def("complete", &Executor::Complete) .def("complete", &Executor::Complete)
#endif #endif
.def("run", .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
(void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & int block_id, bool create_local_scope, bool create_vars) {
Executor::Run); pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars);
});
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG); m.def("init_glog", framework::InitGLOG);
...@@ -631,7 +665,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -631,7 +665,12 @@ All parameter, weight, gradient are variables in Paddle.
&ParallelExecutor::FeedTensorsIntoLocalScopes) &ParallelExecutor::FeedTensorsIntoLocalScopes)
.def("feed_and_split_tensor_into_local_scopes", .def("feed_and_split_tensor_into_local_scopes",
&ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
.def("run", &ParallelExecutor::Run); .def("run", [](ParallelExecutor &self,
const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
pybind11::gil_scoped_release release;
self.Run(fetch_tensors, fetched_var_name);
});
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
return m.ptr(); return m.ptr();
......
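Both executor bindings now wrap the C++ run call in pybind11::gil_scoped_release, so other Python threads (for example a feeding thread pushing into the LoDTensorBlockingQueue) can make progress while C++ executes. A minimal pybind11 module showing the same idiom; the module name demo and the function long_running_job are invented, not part of Paddle's bindings.

#include <pybind11/pybind11.h>
#include <chrono>
#include <thread>

// Stand-in for a long C++ computation that does not touch Python objects.
void long_running_job() {
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
}

PYBIND11_MODULE(demo, m) {
  m.def("run", []() {
    pybind11::gil_scoped_release release;  // other Python threads may run now
    long_running_job();                    // must not call back into Python here
  });
}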
...@@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray( ...@@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray(
template <> template <>
// This following specialization maps uint16_t in the parameter type to // This following specialization maps uint16_t in the parameter type to
// platform::float16. // platform::float16.
void PyCPUTensorSetFromArray( inline void PyCPUTensorSetFromArray(
framework::Tensor *self, framework::Tensor *self,
pybind11::array_t<uint16_t, pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast> pybind11::array::c_style | pybind11::array::forcecast>
...@@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray( ...@@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray(
template <> template <>
// This following specialization maps uint16_t in the parameter type to // This following specialization maps uint16_t in the parameter type to
// platform::float16. // platform::float16.
void PyCUDATensorSetFromArray( inline void PyCUDATensorSetFromArray(
framework::Tensor *self, framework::Tensor *self,
pybind11::array_t<uint16_t, pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast> pybind11::array::c_style | pybind11::array::forcecast>
...@@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray( ...@@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray(
template <> template <>
// This following specialization maps uint16_t in the parameter type to // This following specialization maps uint16_t in the parameter type to
// platform::float16. // platform::float16.
void PyCUDAPinnedTensorSetFromArray( inline void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self, framework::Tensor *self,
pybind11::array_t<uint16_t, pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast> pybind11::array::c_style | pybind11::array::forcecast>
......
...@@ -106,6 +106,8 @@ function cmake_gen() { ...@@ -106,6 +106,8 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-ON}
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
...@@ -133,7 +135,8 @@ EOF ...@@ -133,7 +135,8 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-ON} \
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
} }
function abort(){ function abort(){
......
...@@ -111,7 +111,7 @@ def fetch(): ...@@ -111,7 +111,7 @@ def fetch():
paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path): def convert(path):
......
...@@ -118,7 +118,8 @@ def __bootstrap__(): ...@@ -118,7 +118,8 @@ def __bootstrap__():
read_env_flags = [ read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem'
] ]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
......
...@@ -78,6 +78,8 @@ def as_numpy(tensor): ...@@ -78,6 +78,8 @@ def as_numpy(tensor):
Returns: Returns:
numpy.ndarray numpy.ndarray
""" """
if isinstance(tensor, core.LoDTensorArray):
return [as_numpy(t) for t in tensor]
if isinstance(tensor, list): if isinstance(tensor, list):
return [as_numpy(t) for t in tensor] return [as_numpy(t) for t in tensor]
assert isinstance(tensor, core.LoDTensor) assert isinstance(tensor, core.LoDTensor)
......
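For illustration only (not part of the commit), a minimal sketch of how the extended as_numpy helper is expected to behave once LoDTensorArray inputs are handled: it recurses into the array and returns a plain Python list of numpy arrays. This assumes the LoDTensorArray pybind binding exposes a default constructor and append, which may differ by version; the tensor contents are placeholders.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import executor

place = fluid.CPUPlace()
t = fluid.core.LoDTensor()
t.set(np.arange(6).reshape(2, 3).astype("float32"), place)

arr = fluid.core.LoDTensorArray()   # e.g. an array fetched from a while loop (assumed constructible)
arr.append(t)

result = executor.as_numpy(arr)     # with this change: a list of ndarrays
print(type(result), result[0].shape)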
...@@ -27,6 +27,7 @@ __all__ = [ ...@@ -27,6 +27,7 @@ __all__ = [
'Variable', 'Variable',
'Program', 'Program',
'Operator', 'Operator',
'Parameter',
'default_startup_program', 'default_startup_program',
'default_main_program', 'default_main_program',
'program_guard', 'program_guard',
...@@ -454,7 +455,7 @@ class Operator(object): ...@@ -454,7 +455,7 @@ class Operator(object):
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
'ncclInit', 'channel_create', 'channel_close', 'channel_send', 'ncclInit', 'channel_create', 'channel_close', 'channel_send',
'channel_recv', 'select', 'gen_nccl_id' 'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id'
} }
def __init__(self, def __init__(self,
...@@ -559,19 +560,8 @@ class Operator(object): ...@@ -559,19 +560,8 @@ class Operator(object):
self.attrs[attr_name] is None): self.attrs[attr_name] is None):
continue continue
attr_val = self.attrs[attr_name] attr_val = self.attrs[attr_name]
if isinstance(attr_val, Block): self._update_desc_attr(attr_name, attr_val)
self.desc.set_block_attr(attr_name,
self.attrs[attr_name].desc)
elif isinstance(attr_val, list) and attr_val and \
all(isinstance(v, Block) for v in attr_val):
self.desc.set_blocks_attr(attr_name,
[v.desc for v in attr_val])
elif isinstance(attr_val, core.BlockDesc) or \
isinstance(attr_val, core.ProgramDesc):
self.desc.set_serialized_attr(
attr_name, attr_val.serialize_to_string())
else:
self.desc.set_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
if self.has_kernel(type): if self.has_kernel(type):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
...@@ -718,6 +708,19 @@ class Operator(object): ...@@ -718,6 +708,19 @@ class Operator(object):
ValueError: If the type of value doesn't match with desc.attr_type(name). ValueError: If the type of value doesn't match with desc.attr_type(name).
""" """
self.attrs[name] = val self.attrs[name] = val
self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val):
"""
Update the value of desc's attribute by attribute's name.
Args:
name(str): the attribute name.
val(bool|int|str|float|list): the value of the attribute.
Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name).
"""
if isinstance(val, Block): if isinstance(val, Block):
self.desc.set_block_attr(name, val.desc) self.desc.set_block_attr(name, val.desc)
elif isinstance(val, list) and val and all( elif isinstance(val, list) and val and all(
...@@ -1212,6 +1215,9 @@ class Block(object): ...@@ -1212,6 +1215,9 @@ class Block(object):
if var.type == core.VarDesc.VarType.STEP_SCOPES: if var.type == core.VarDesc.VarType.STEP_SCOPES:
ret_var = self.create_var( ret_var = self.create_var(
name=var.name, persistable=var.persistable, type=var.type) name=var.name, persistable=var.persistable, type=var.type)
elif var.type == core.VarDesc.VarType.RAW:
ret_var = self.create_var(
name=var.name, persistable=var.persistable, type=var.type)
elif var.type == core.VarDesc.VarType.SELECTED_ROWS: elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
ret_var = self.create_var( ret_var = self.create_var(
name=var.name, name=var.name,
...@@ -1917,11 +1923,11 @@ def program_guard(main_program, startup_program=None): ...@@ -1917,11 +1923,11 @@ def program_guard(main_program, startup_program=None):
def get_var(name, program=None): def get_var(name, program=None):
""" """
Get a variable by name from the global block of a program. Get a variable by name from the global block of a program.
Args: Args:
name(str): name of the variable name(str): name of the variable
program(Program|None): program object. program(Program|None): program object.
If None, default_global_program() will be used. If None, default_global_program() will be used.
Returns: Returns:
Variable Variable
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import os import os
import errno
import time import time
import shutil import shutil
...@@ -25,7 +26,8 @@ __all__ = [ ...@@ -25,7 +26,8 @@ __all__ = [
'load_persistables', 'save_inference_model', 'load_inference_model', 'load_persistables', 'save_inference_model', 'load_inference_model',
'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'get_inference_program', 'save_checkpoint', 'load_checkpoint',
'clean_checkpoint', 'load_persist_vars_without_grad', 'clean_checkpoint', 'load_persist_vars_without_grad',
'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' 'load_lookup_table_vars', 'save_persist_vars_without_grad',
'get_latest_checkpoint_serial'
] ]
...@@ -795,6 +797,7 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -795,6 +797,7 @@ def get_parameter_value_by_name(name, executor, program=None):
SUCCESS_MARK_FILENAME = "_SUCCESS" SUCCESS_MARK_FILENAME = "_SUCCESS"
CHECKPOINT_PREFIX = "checkpoint" CHECKPOINT_PREFIX = "checkpoint"
MODEL_DIR = "__model__" MODEL_DIR = "__model__"
LOOKUP_TABLE_DIR = "__lookup_table__"
TRAINER_PREFIX = "trainer" TRAINER_PREFIX = "trainer"
CHECKPOINT_SEPARATOR = "_" CHECKPOINT_SEPARATOR = "_"
...@@ -804,7 +807,9 @@ def save_checkpoint(executor, ...@@ -804,7 +807,9 @@ def save_checkpoint(executor,
trainer_id, trainer_id,
trainer_args=None, trainer_args=None,
main_program=None, main_program=None,
max_num_checkpoints=3): max_num_checkpoints=3,
lookup_table=None,
ps_endpoint_list=None):
""" """
This function filters out all checkpoint variables from the given This function filters out all checkpoint variables from the given
main_program and then saves these variables to the `checkpoint_dir` main_program and then saves these variables to the `checkpoint_dir`
...@@ -836,6 +841,12 @@ def save_checkpoint(executor, ...@@ -836,6 +841,12 @@ def save_checkpoint(executor,
max_num_checkpoints(int): The max number of total number of existing max_num_checkpoints(int): The max number of total number of existing
checkpoints. checkpoints.
Default: 3 Default: 3
lookup_table(string|None): the lookup table name. When using a distributed
lookup table, the table name can be obtained from
DistributeTranspiler.table_name.
ps_endpoint_list(list|None): the parameter server ip:port list. When using
a distributed lookup table, ps_endpoint_list can be obtained from the
distribute transpiler arguments.
Returns: Returns:
None None
...@@ -852,30 +863,40 @@ def save_checkpoint(executor, ...@@ -852,30 +863,40 @@ def save_checkpoint(executor,
prog = fluid.default_main_program() prog = fluid.default_main_program()
trainer_args = {"epoch_id": 200, trainer_args = {"epoch_id": 200,
"step_id": 20} # just an example "step_id": 20} # just an example
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
fluid.io.save_checkpoint(executor=exe, fluid.io.save_checkpoint(executor=exe,
checkpoint_dir=path, checkpoint_dir=path,
trainer_id=0, trainer_id=0,
trainer_args=trainer_args, trainer_args=trainer_args,
main_program=prog, main_program=prog,
max_num_checkpoints=3) max_num_checkpoints=3,
lookup_table=table_name,
ps_endpoint_list = ps_endpoints)
""" """
if checkpoint_dir is None: if checkpoint_dir is None:
raise ValueError("'checkpoint_dir' should not be None") raise ValueError("'checkpoint_dir' should not be None")
assert checkpoint_dir
if trainer_args: if trainer_args:
assert isinstance(trainer_args, dict) assert isinstance(trainer_args, dict)
if not os.path.isdir(checkpoint_dir): is_chief = trainer_id == 0
os.makedirs(checkpoint_dir)
_make_chekcpoint_dirs(checkpoint_dir)
serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
cur_dir = _get_serial_dir(checkpoint_dir, serial) cur_dir = _get_serial_dir(checkpoint_dir, serial)
save_trainer_args(cur_dir, trainer_id, trainer_args) save_trainer_args(cur_dir, trainer_id, trainer_args)
if trainer_id == 0: if is_chief:
save_persist_vars_without_grad(executor, cur_dir, main_program) save_persist_vars_without_grad(executor, cur_dir, main_program)
if is_chief and lookup_table and ps_endpoint_list:
save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
ps_endpoint_list)
_scroll_delete(checkpoint_dir, max_num_checkpoints) _scroll_delete(checkpoint_dir, max_num_checkpoints)
...@@ -942,8 +963,9 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): ...@@ -942,8 +963,9 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
def clean_checkpoint(checkpoint_dir, delete_dir=False): def clean_checkpoint(checkpoint_dir, delete_dir=False):
""" """
clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. clean the checkpoint dir; when the train exits normally,
delete_dir only works when the directory is empty, otherwise, OSError is raised. the trainer will call clean_checkpoint to delete the checkpoint directory saved before.
delete_dir only works when the directory is empty; otherwise, OSError is raised.
: param checkpoint_dir : param checkpoint_dir
: param delete_dir : param delete_dir
...@@ -1009,6 +1031,56 @@ def load_persist_vars_without_grad(executor, ...@@ -1009,6 +1031,56 @@ def load_persist_vars_without_grad(executor,
filename=None) filename=None)
def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
"""
The parameter server will load the lookup table's local file into a
SelectedRows variable.
Args:
executor(Executor): The executor to run for loading persistable variables
dirname(str): The directory path
program(Program): Find the variable named table_name in this program
pserver_id(int): the serial number in pserver_endpoints list
table_name(str): lookup table name
Returns:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
dirname = "./checkpoints/checkpoint_9/__model__"
prog = fluid.default_main_program()
pserver_id = 1
table_name = "share_w"
fluid.io.load_lookup_table_vars(executor=exe,
dirname=dirname, program=prog, pserver_id=pserver_id,
table_name=table_name)
"""
for var in program.list_vars():
if var.name == table_name:
lookup_table_var = var
break
assert lookup_table_var is not None
lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
load_prog = Program()
load_block = load_prog.global_block()
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [lookup_table_var]},
attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
executor.run(load_prog)
def save_persist_vars_without_grad(executor, dirname, program): def save_persist_vars_without_grad(executor, dirname, program):
""" """
This function filters out all checkpoint variables from the given This function filters out all checkpoint variables from the given
...@@ -1055,6 +1127,54 @@ def save_persist_vars_without_grad(executor, dirname, program): ...@@ -1055,6 +1127,54 @@ def save_persist_vars_without_grad(executor, dirname, program):
_write_success(cur_dir) _write_success(cur_dir)
def save_pserver_vars_by_notify(executor, dirname, lookup_table,
ps_endpoint_list):
"""
This function will send a checkpoint notify message from Trainer 0
to all the pservers.
The checkpoint notify message contains the lookup table name and
the absolute path on the pserver where the lookup table will be saved.
Args:
executor(Executor): The executor to run for sending the checkpoint notify.
dirname(str): The folder in which to save checkpoints.
lookup_table(string): the lookup table name. When using a distributed
lookup table, the table name can be obtained from
DistributeTranspiler.table_name.
ps_endpoint_list(list): the parameter server ip:port list. When using a
distributed lookup table, ps_endpoint_list can be obtained from the
distribute transpiler arguments.
Return:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
fluid.io.save_pserver_vars_by_notify(executor=exe,
dirname=param_path, lookup_table=table_name,
ps_endpoint_list=ps_endpoints)
"""
cur_dir = _get_lookuptable_dir(dirname)
checkpoint_notify_program = Program()
checkpoint_notify_block = checkpoint_notify_program.global_block()
attrs = {}
attrs['epmap'] = ps_endpoint_list
attrs['dir'] = cur_dir
attrs['lookup_table'] = lookup_table
checkpoint_notify_block.append_op(
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
executor.run(checkpoint_notify_program)
def save_trainer_args(dirname, trainer_id, trainer_args): def save_trainer_args(dirname, trainer_id, trainer_args):
assert isinstance(trainer_args, dict) assert isinstance(trainer_args, dict)
...@@ -1068,6 +1188,29 @@ def save_trainer_args(dirname, trainer_id, trainer_args): ...@@ -1068,6 +1188,29 @@ def save_trainer_args(dirname, trainer_id, trainer_args):
def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
"""
the trainer will load some args from its own directory,
such as epoch_id and step_id.
Args:
checkpoint_dir(str): The folder where all checkpoints are.
serial(int): The serial of checkpoint you would like to load.
trainer_id(int): current trainer id.
trainer_args(list): the list of trainer args to load
Return:
None
Examples:
.. code-block:: python
param_path = "./checkpoint/"
serial = 7
trainer_id = 2
trainer_args = ["epoch_id", "step_id"]
fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial,
trainer_id=trainer_id, trainer_args=trainer_args)
"""
assert isinstance(trainer_args, list) assert isinstance(trainer_args, list)
cur_dir = _get_serial_dir(checkpoint_dir, serial) cur_dir = _get_serial_dir(checkpoint_dir, serial)
...@@ -1088,7 +1231,7 @@ def _is_checkpoint_var(var): ...@@ -1088,7 +1231,7 @@ def _is_checkpoint_var(var):
the checkpoint will not save or load all the variables. the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
: param var : param var(Variable)
""" """
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
...@@ -1108,6 +1251,23 @@ def _is_checkpoint_var(var): ...@@ -1108,6 +1251,23 @@ def _is_checkpoint_var(var):
return var.persistable return var.persistable
def _make_chekcpoint_dirs(dirs):
"""
_make_chekcpoint_dirs creates the local directory directly; if the directory already exists, it is ignored.
"""
assert dirs is not None
if os.path.isfile(dirs):
raise OSError(errno.ENOTDIR, "dirs path should be a directory.", dirs)
if not os.path.isdir(dirs):
try:
os.makedirs(dirs)
except OSError as err:
if err.errno != errno.EEXIST:
raise err
def _get_dir_serial(dirname): def _get_dir_serial(dirname):
_, serial = dirname.split(CHECKPOINT_SEPARATOR) _, serial = dirname.split(CHECKPOINT_SEPARATOR)
...@@ -1121,29 +1281,27 @@ def _get_dir_serial(dirname): ...@@ -1121,29 +1281,27 @@ def _get_dir_serial(dirname):
def _get_serial_dir(dirname, serial): def _get_serial_dir(dirname, serial):
serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
serial_dir = os.path.join(dirname, serial_folder) serial_dir = os.path.join(dirname, serial_folder)
_make_chekcpoint_dirs(serial_dir)
if not os.path.isdir(serial_dir):
os.makedirs(serial_dir)
return serial_dir return serial_dir
def _get_model_dir(dirname): def _get_model_dir(dirname):
model_dir = os.path.join(dirname, MODEL_DIR) model_dir = os.path.join(dirname, MODEL_DIR)
_make_chekcpoint_dirs(model_dir)
return model_dir
if not os.path.isdir(model_dir):
os.makedirs(model_dir)
return model_dir def _get_lookuptable_dir(dirname):
lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
_make_chekcpoint_dirs(lookuptable_dir)
return lookuptable_dir
def _get_trainer_dir(dirname, trainer_id): def _get_trainer_dir(dirname, trainer_id):
trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
trainer_dir = os.path.join(dirname, trainer_folder) trainer_dir = os.path.join(dirname, trainer_folder)
_make_chekcpoint_dirs(trainer_dir)
if not os.path.isdir(trainer_dir):
os.makedirs(trainer_dir)
return trainer_dir return trainer_dir
...@@ -1162,7 +1320,11 @@ def _scroll_delete(dirname, max_num_checkpoints=3): ...@@ -1162,7 +1320,11 @@ def _scroll_delete(dirname, max_num_checkpoints=3):
serials = serials[max_num_checkpoints:] serials = serials[max_num_checkpoints:]
for serial in serials: for serial in serials:
cur_dir = _get_serial_dir(dirname, serial) cur_dir = _get_serial_dir(dirname, serial)
shutil.rmtree(cur_dir) try:
shutil.rmtree(cur_dir)
except OSError as err:
if err.errno != errno.ENOENT:
raise err
def _write_success(dirname): def _write_success(dirname):
......
...@@ -110,7 +110,7 @@ class BlockGuardServ(BlockGuard): ...@@ -110,7 +110,7 @@ class BlockGuardServ(BlockGuard):
class ListenAndServ(object): class ListenAndServ(object):
""" """
**ListenAndServ Layer** **ListenAndServ Layer**
ListenAndServ is used to create a rpc server bind and listen ListenAndServ is used to create a rpc server bind and listen
on specific TCP port, this server will run the sub-block when on specific TCP port, this server will run the sub-block when
received variables from clients. received variables from clients.
...@@ -212,7 +212,7 @@ def Send(endpoints, send_vars, sync=True): ...@@ -212,7 +212,7 @@ def Send(endpoints, send_vars, sync=True):
of send_vars to send of send_vars to send
send_vars (list): variables to send to server send_vars (list): variables to send to server
sync (bool): whether to wait the request finish sync (bool): whether to wait the request finish
""" """
assert (type(send_vars) == list) assert (type(send_vars) == list)
...@@ -469,10 +469,13 @@ def open_files(filenames, ...@@ -469,10 +469,13 @@ def open_files(filenames,
lod_levels(list): List of ints which declaring data lod_level. lod_levels(list): List of ints which declaring data lod_level.
dtypes(list): List of strs which declaring data type. dtypes(list): List of strs which declaring data type.
thread_num(int): The maximal concurrent prefetch thread number. thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int): The size of prefetch buffer. buffer_size(int|None): The size of the prefetch buffer. If it is set to None,
the buffer size will be thread_num * 3.
Default: None
pass_num(int): Number of passes to run. pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel. subsequent operators in parallel.
Default: True
Returns: Returns:
Variable: A Reader Variable via which we can get file data. Variable: A Reader Variable via which we can get file data.
...@@ -492,7 +495,7 @@ def open_files(filenames, ...@@ -492,7 +495,7 @@ def open_files(filenames,
image, label = fluid.layers.io.read_file(reader) image, label = fluid.layers.io.read_file(reader)
""" """
if buffer_size is None: if buffer_size is None:
buffer_size = thread_num buffer_size = thread_num * 3
if isinstance(filenames, basestring): if isinstance(filenames, basestring):
filenames = [filenames] filenames = [filenames]
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
......
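A hedged usage sketch of the new default (not part of the commit); the recordio file names and shapes below are placeholders. With buffer_size left as None, the prefetch buffer falls back to thread_num * 3.

import paddle.fluid as fluid

reader = fluid.layers.open_files(
    filenames=['./mnist-00000.recordio', './mnist-00001.recordio'],  # placeholders
    shapes=[(-1, 784), (-1, 1)],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'],
    thread_num=2)                    # buffer_size defaults to 2 * 3 = 6
image, label = fluid.layers.io.read_file(reader)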
...@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc ...@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc
from tensor import concat from tensor import concat
import utils import utils
import random import random
from .. import unique_name
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -1992,7 +1993,8 @@ def batch_norm(input, ...@@ -1992,7 +1993,8 @@ def batch_norm(input,
name=None, name=None,
moving_mean_name=None, moving_mean_name=None,
moving_variance_name=None, moving_variance_name=None,
do_model_average_for_mean_and_var=False): do_model_average_for_mean_and_var=False,
fuse_with_relu=False):
""" """
**Batch Normalization Layer** **Batch Normalization Layer**
...@@ -2035,6 +2037,7 @@ def batch_norm(input, ...@@ -2035,6 +2037,7 @@ def batch_norm(input,
moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
Returns: Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input. Variable: A tensor variable which is the result after applying batch normalization on the input.
...@@ -2120,7 +2123,8 @@ def batch_norm(input, ...@@ -2120,7 +2123,8 @@ def batch_norm(input,
"momentum": momentum, "momentum": momentum,
"epsilon": epsilon, "epsilon": epsilon,
"is_test": is_test, "is_test": is_test,
"use_mkldnn": use_mkldnn "use_mkldnn": use_mkldnn,
"fuse_with_relu": fuse_with_relu
}) })
return helper.append_activation(batch_norm_out) return helper.append_activation(batch_norm_out)
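A rough usage sketch of the new switch (assuming an MKL-DNN build, since the fused kernel is only relevant to the MKL-DNN batch_norm implementation; shapes are illustrative only):

import paddle.fluid as fluid

data = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(input=data, num_filters=16, filter_size=3)
# fuse_with_relu asks the kernel to apply ReLU inside batch_norm
out = fluid.layers.batch_norm(input=conv, use_mkldnn=True, fuse_with_relu=True)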
...@@ -2223,56 +2227,6 @@ def layer_norm(input, ...@@ -2223,56 +2227,6 @@ def layer_norm(input,
return helper.append_activation(layer_norm_out) return helper.append_activation(layer_norm_out)
def beam_search_decode(ids, scores, name=None):
"""
Beam Search Decode
This layers is to pack the output of beam search layer into sentences and
associated scores. It is usually called after the beam search layer.
Typically, the output of beam search layer is a tensor of selected ids, with
a tensor of the score of each id. Beam search layer's output ids, however,
are generated directly during the tree search, and they are stacked by each
level of the search tree. Thus we need to reorganize them into sentences,
based on the score of each id. This layer takes the output of beam search
layer as input and repack them into sentences.
Args:
ids (Variable): The selected ids, output of beam search layer.
scores (Variable): The associated scores of the ids, out put of beam
search layer.
name (str): The name of this layer. It is optional.
Returns:
tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
sentence_ids is a tensor with shape [size, length], where size is the
beam size of beam search, and length is the length of each sentence.
Note that the length of sentences may vary.
sentence_scores is a tensor with the same shape as sentence_ids.
Examples:
.. code-block:: python
ids, scores = fluid.layers.beam_search(
pre_ids, ids, scores, beam_size, end_id)
sentence_ids, sentence_scores = fluid.layers.beam_search_decode(
ids, scores)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
})
return sentence_ids, sentence_scores
def conv2d_transpose(input, def conv2d_transpose(input,
num_filters, num_filters,
output_size=None, output_size=None,
...@@ -2383,10 +2337,17 @@ def conv2d_transpose(input, ...@@ -2383,10 +2337,17 @@ def conv2d_transpose(input,
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
""" """
helper = LayerHelper("conv2d_transpose", **locals())
input_channel = input.shape[1]
op_type = 'conv2d_transpose'
if (input_channel == groups and num_filters == input_channel and
not use_cudnn):
op_type = 'depthwise_conv2d_transpose'
helper = LayerHelper(op_type, **locals())
if not isinstance(input, Variable): if not isinstance(input, Variable):
raise TypeError("Input of conv2d_transpose must be Variable") raise TypeError("Input of conv2d_transpose must be Variable")
input_channel = input.shape[1]
padding = utils.convert_to_list(padding, 2, 'padding') padding = utils.convert_to_list(padding, 2, 'padding')
stride = utils.convert_to_list(stride, 2, 'stride') stride = utils.convert_to_list(stride, 2, 'stride')
...@@ -2420,7 +2381,7 @@ def conv2d_transpose(input, ...@@ -2420,7 +2381,7 @@ def conv2d_transpose(input,
pre_bias = helper.create_tmp_variable(dtype=input.dtype) pre_bias = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type='conv2d_transpose', type=op_type,
inputs={'Input': [input], inputs={'Input': [input],
'Filter': [img_filter]}, 'Filter': [img_filter]},
outputs={'Output': pre_bias}, outputs={'Output': pre_bias},
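A minimal sketch of the dispatch rule introduced above (shapes are illustrative only): when groups and num_filters both equal the input channel count and cuDNN is disabled, the layer lowers to depthwise_conv2d_transpose instead of conv2d_transpose.

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[8, 32, 32], dtype='float32')
out = fluid.layers.conv2d_transpose(
    input=data,
    num_filters=8,       # equals the input channel count
    filter_size=3,
    groups=8,            # equals the input channel count
    use_cudnn=False)     # together these select depthwise_conv2d_transpose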
...@@ -2676,38 +2637,89 @@ def sequence_expand(x, y, ref_level=-1, name=None): ...@@ -2676,38 +2637,89 @@ def sequence_expand(x, y, ref_level=-1, name=None):
return tmp return tmp
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): def beam_search(pre_ids,
''' pre_scores,
**beam search** ids,
scores,
This function implements the beam search algorithm. beam_size,
end_id,
Beam search is a classical algorithm for selecting candidate words level=0,
in a machine translation task. name=None):
"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_ Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
for more details. for more details.
This layer does the search in beams for one time step. Specifically, it
selects the top-K candidate word ids of current step from :attr:`ids`
according to their :attr:`scores` for all source sentences, where K is
:attr:`beam_size` and :attr:`ids, scores` are predicted results from the
computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
the output of beam_search at the previous step; they are needed to handle
candidate translations that have already ended.
Note that the :attr:`scores` passed in should be accumulated scores, and
any length penalty should be applied with extra operators before computing
the accumulated scores if needed. It is also suggested to select the top-K
candidates beforehand and pass only those candidates in.
Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py
Args: Args:
pre_ids (Variable): ids in previous step. pre_ids(Variable): The LodTensor variable which is the output of
ids (Variable): a LoDTensor of shape of [None,k] beam_search at previous step. It should be a LodTensor with shape
scores (Variable): a LoDTensor that has the same shape and LoD with `ids` :math:`(batch_size, 1)` and lod
beam_size (int): beam size for beam search :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
end_id (int): the token id which indicates the end of a sequence first step.
level (int): the level of LoDTensor pre_scores(Variable): The LodTensor variable which is the output of
beam_search at previous step.
ids(Variable): The LodTensor variable containing the candidates ids.
Its shape should be :math:`(batch_size \\times beam_size, K)`,
where :math:`K` is supposed to be :attr:`beam_size`.
scores(Variable): The LodTensor variable containing the accumulated
scores corresponding to :attr:`ids` and its shape is the same as
the shape of :attr:`ids`.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
level(int, default 0): It can be ignored and must not be changed currently.
It means the source level of lod, which is explained as follows.
The lod level of :attr:`ids` should be 2. The first level is the source
level, which describes how many prefixes (branches) each source
sentence (beam) has, and the second level is the sentence level, which
describes how these candidates belong to the prefix. The paths
linking prefixes and selected candidates are organized and reserved
in lod.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
tuple: a tuple of beam_search output variables: `selected_ids`, `selected_scores` Variable: The LodTensor pair containing the selected ids and the \
corresponding scores.
Examples: Examples:
.. code-block:: python .. code-block:: python
# current_score is a Tensor of shape (num_batch_size, embed_size), which # Suppose `probs` contains predicted results from the computation
# consists score of each candidate word. # cell and `pre_ids` and `pre_scores` is the output of beam_search
topk_scores, topk_indices = pd.topk(current_score, k=50) # at previous step.
selected_ids, selected_scores = pd.beam_search( topk_scores, topk_indices = layers.topk(probs, k=beam_size)
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) accu_scores = layers.elementwise_add(
''' x=layers.log(x=topk_scores)),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=end_id)
"""
helper = LayerHelper('beam_search', **locals()) helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype score_type = scores.dtype
id_type = ids.dtype id_type = ids.dtype
...@@ -2719,6 +2731,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): ...@@ -2719,6 +2731,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
type='beam_search', type='beam_search',
inputs={ inputs={
'pre_ids': pre_ids, 'pre_ids': pre_ids,
'pre_scores': pre_scores,
'ids': ids, 'ids': ids,
'scores': scores, 'scores': scores,
}, },
...@@ -2736,6 +2749,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): ...@@ -2736,6 +2749,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
return selected_ids, selected_scores return selected_ids, selected_scores
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
Beam Search Decode Layer. This layer constructs the full hypotheses for
each source sentence by walking back along the LoDTensorArray :attr:`ids`
whose lods can be used to restore the path in the beam search tree.
Please see the following demo for a full beam search usage example:
fluid/tests/book/test_machine_translation.py
Args:
ids(Variable): The LodTensorArray variable containing the selected ids
of all steps.
scores(Variable): The LodTensorArray variable containing the selected
scores of all steps.
beam_size(int): The beam width used in beam search.
end_id(int): The id of end token.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The LodTensor pair containing the generated id sequences \
and the corresponding scores. The shapes and lods of the two \
LodTensor are the same. The lod level is 2 and the two levels \
separately indicate how many hypotheses each source sentence has \
and how many ids each hypothesis has.
Examples:
.. code-block:: python
# Suppose `ids` and `scores` are LodTensorArray variables reserving
# the selected ids and scores of all steps
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=5, end_id=0)
"""
helper = LayerHelper('beam_search_decode', **locals())
sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
helper.append_op(
type="beam_search_decode",
inputs={"Ids": ids,
"Scores": scores},
outputs={
"SentenceIds": sentence_ids,
"SentenceScores": sentence_scores
},
attrs={"beam_size": beam_size,
"end_id": end_id})
return sentence_ids, sentence_scores
def lstm_unit(x_t, def lstm_unit(x_t,
hidden_t_prev, hidden_t_prev,
cell_t_prev, cell_t_prev,
...@@ -4266,14 +4329,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4266,14 +4329,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
say :attr:`actual_shape` has a higher priority say :attr:`actual_shape` has a higher priority
than :attr:`shape`. than :attr:`shape`.
act (str): The non-linear activation to be applied to output variable. act (str): The non-linear activation to be applied to output variable.
inplace(bool): If this flag is set true, a new output tensor is created inplace(bool): If this flag is set true, the output
whose data is copied from input x, otherwise the output shares data with input without copying, otherwise
shares data with input without copying. a new output tensor is created
whose data is copied from input x.
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
Variable: The output tensor. Variable: The output tensor.
Raises:
TypeError: if actual_shape is neither Variable nor None.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -4285,6 +4352,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4285,6 +4352,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
if not (isinstance(shape, list) or isinstance(shape, tuple)): if not (isinstance(shape, list) or isinstance(shape, tuple)):
raise ValueError("Input shape must be a python lsit or tuple.") raise ValueError("Input shape must be a python lsit or tuple.")
inputs = {"X": x}
if isinstance(actual_shape, Variable):
inputs["Shape"] = actual_shape
elif actual_shape is not None:
raise TypeError("actual_shape should either be Variable or None")
# Validate the shape # Validate the shape
unk_dim_idx = -1 unk_dim_idx = -1
...@@ -4305,9 +4377,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4305,9 +4377,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
reshaped = helper.create_tmp_variable(dtype=x.dtype) reshaped = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape",
inputs={"X": x, inputs=inputs,
"Shape": actual_shape}
if isinstance(actual_shape, Variable) else {"X": x},
attrs={"shape": shape, attrs={"shape": shape,
"inplace": inplace}, "inplace": inplace},
outputs={"Out": reshaped}) outputs={"Out": reshaped})
...@@ -4889,47 +4959,39 @@ def random_crop(x, shape, seed=None): ...@@ -4889,47 +4959,39 @@ def random_crop(x, shape, seed=None):
>>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
""" """
helper = LayerHelper("random_crop", **locals()) helper = LayerHelper("random_crop", **locals())
dtype = helper.input_dtype() dtype = x.dtype
out = helper.create_tmp_variable(dtype) out = helper.create_tmp_variable(dtype)
if seed is None: if seed is None:
seed = random.randint(-65536, 65535) seed = random.randint(-65536, 65535)
op_attrs = {"shape": shape}
if isinstance(seed, int): if isinstance(seed, int):
seed_value = seed op_attrs["startup_seed"] = seed
seed = helper.create_tmp_variable(dtype="int64") seed = helper.create_variable(
helper.append_op( name=unique_name.generate("random_crop_seed"),
type="fill_constant", dtype="int64",
inputs={}, persistable=True)
outputs={"Out": seed},
attrs={
"dtype": seed.dtype,
"shape": [1],
"value": float(seed_value),
"force_cpu": True
})
elif not isinstance(seed, Variable): elif not isinstance(seed, Variable):
raise ValueError("'seed' must be a Variable or an int.") raise ValueError("'seed' must be a Variable or an int.")
seed_out = helper.create_tmp_variable(dtype="int64")
helper.append_op( helper.append_op(
type="random_crop", type="random_crop",
inputs={"X": x, inputs={"X": x,
"Seed": seed}, "Seed": seed},
outputs={"Out": out, outputs={"Out": out,
"SeedOut": seed_out}, "SeedOut": seed},
attrs={"shape": shape}) attrs=op_attrs)
return out return out
def log(input): def log(x):
""" """
Calculates the natural log of the given input tensor, element-wise. Calculates the natural log of the given input tensor, element-wise.
.. math:: .. math::
Out = \\ln(input) Out = \\ln(x)
Args: Args:
input (Variable): Input tensor. x (Variable): Input tensor.
Returns: Returns:
Variable: The natural log of the input tensor computed element-wise. Variable: The natural log of the input tensor computed element-wise.
...@@ -4938,7 +5000,7 @@ def log(input): ...@@ -4938,7 +5000,7 @@ def log(input):
.. code-block:: python .. code-block:: python
output = fluid.layers.log(input) output = fluid.layers.log(x)
""" """
helper = LayerHelper('log', **locals()) helper = LayerHelper('log', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
...@@ -4947,18 +5009,18 @@ def log(input): ...@@ -4947,18 +5009,18 @@ def log(input):
return out return out
def relu(input): def relu(x):
""" """
Relu takes one input data (Tensor) and produces one output data (Tensor) Relu takes one input data (Tensor) and produces one output data (Tensor)
where the rectified linear function, y = max(0, input), is applied to where the rectified linear function, y = max(0, x), is applied to
the tensor elementwise. the tensor elementwise.
.. math:: .. math::
Out = \\max(0, input) Out = \\max(0, x)
Args: Args:
input (Variable): The input tensor. x (Variable): The input tensor.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -4967,7 +5029,7 @@ def relu(input): ...@@ -4967,7 +5029,7 @@ def relu(input):
.. code-block:: python .. code-block:: python
output = fluid.layers.relu(input) output = fluid.layers.relu(x)
""" """
helper = LayerHelper('relu', **locals()) helper = LayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
...@@ -5015,12 +5077,12 @@ def mean_iou(input, label, num_classes): ...@@ -5015,12 +5077,12 @@ def mean_iou(input, label, num_classes):
out_correct = helper.create_tmp_variable(dtype='int32') out_correct = helper.create_tmp_variable(dtype='int32')
helper.append_op( helper.append_op(
type="mean_iou", type="mean_iou",
inputs={"predictions": input, inputs={"Predictions": input,
"labels": label}, "Labels": label},
outputs={ outputs={
"out_mean_iou": out_mean_iou, "OutMeanIou": out_mean_iou,
"out_wrong": out_wrong, "OutWrong": out_wrong,
"out_correct": out_correct "OutCorrect": out_correct
}, },
attrs={"num_classes": num_classes}) attrs={"num_classes": num_classes})
return out_mean_iou, out_wrong, out_correct return out_mean_iou, out_wrong, out_correct
......
...@@ -156,7 +156,7 @@ def cast(x, dtype): ...@@ -156,7 +156,7 @@ def cast(x, dtype):
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name='x', shape=[13], dtype='float32') data = fluid.layers.data(name='x', shape=[13], dtype='float32')
result = fluid.layers.cast(x=data, dtype='float64') result = fluid.layers.cast(x=data, dtype='float64')
""" """
...@@ -189,7 +189,7 @@ def concat(input, axis=0, name=None): ...@@ -189,7 +189,7 @@ def concat(input, axis=0, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
""" """
helper = LayerHelper('concat', **locals()) helper = LayerHelper('concat', **locals())
...@@ -239,7 +239,7 @@ def sums(input, out=None): ...@@ -239,7 +239,7 @@ def sums(input, out=None):
return out return out
def assign(input, output): def assign(input, output=None):
""" """
**Assign** **Assign**
...@@ -247,7 +247,7 @@ def assign(input, output): ...@@ -247,7 +247,7 @@ def assign(input, output):
Args: Args:
input(Variable|numpy.ndarray): The source variable input(Variable|numpy.ndarray): The source variable
output(Variable): The destination variable output(Variable|None): The destination variable
Returns: Returns:
Variable: The destination variable that was supplied as the *output*. Variable: The destination variable that was supplied as the *output*.
...@@ -260,6 +260,8 @@ def assign(input, output): ...@@ -260,6 +260,8 @@ def assign(input, output):
fluid.layers.assign(hidden, out) fluid.layers.assign(hidden, out)
""" """
helper = LayerHelper('assign', **locals()) helper = LayerHelper('assign', **locals())
if output is None:
output = helper.create_tmp_variable(dtype=input.dtype)
if isinstance(input, Variable): if isinstance(input, Variable):
helper.append_op( helper.append_op(
type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
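With output now optional, a minimal sketch (variable names are placeholders): assign allocates and returns a temporary destination when none is given, for both Variable and numpy inputs.

import numpy as np
import paddle.fluid as fluid

hidden = fluid.layers.data(name='hidden', shape=[4], dtype='float32')
copied = fluid.layers.assign(hidden)  # destination created internally
const = fluid.layers.assign(np.array([[1.0, 2.0]], dtype='float32'))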
...@@ -443,7 +445,7 @@ def argmax(x, axis=0): ...@@ -443,7 +445,7 @@ def argmax(x, axis=0):
return out return out
def argsort(input, axis=-1): def argsort(input, axis=-1, name=None):
""" """
Performs sorting on the input Variable along the given axis, and outputs Performs sorting on the input Variable along the given axis, and outputs
sorted data Variable and its corresponding index Variable with the same sorted data Variable and its corresponding index Variable with the same
...@@ -471,6 +473,8 @@ def argsort(input, axis=-1): ...@@ -471,6 +473,8 @@ def argsort(input, axis=-1):
axis(int): The axis along which to sort the input Variable. When axis(int): The axis along which to sort the input Variable. When
:attr:`axis` < 0, the actual axis will be :attr:`axis` + :attr:`axis` < 0, the actual axis will be :attr:`axis` +
rank(:attr:`input`). Default -1, the last dimension. rank(:attr:`input`). Default -1, the last dimension.
name(str|None): (optional) A name for this layer. If set None, the
layer will be named automatically.
Returns: Returns:
tuple: A tuple of sorted data Variable and the sorted indices. tuple: A tuple of sorted data Variable and the sorted indices.
...@@ -488,8 +492,8 @@ def argsort(input, axis=-1): ...@@ -488,8 +492,8 @@ def argsort(input, axis=-1):
type='argsort', type='argsort',
inputs={'X': input}, inputs={'X': input},
outputs={'Out': out, outputs={'Out': out,
'Indics': ids}, 'Indices': ids},
attts={'axis': axis}) attrs={'axis': axis})
return out, ids return out, ids
......
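A short usage sketch of the corrected op wiring ('Indices' output key, attrs keyword) together with the new optional name argument; the input shape is a placeholder.

import paddle.fluid as fluid

x = fluid.layers.data(
    name='x', shape=[5, 7], dtype='float32', append_batch_size=False)
sorted_x, indices = fluid.layers.argsort(input=x, axis=-1, name='sort_x')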
...@@ -18,15 +18,16 @@ import numpy as np ...@@ -18,15 +18,16 @@ import numpy as np
__all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
def create_lod_tensor(data, lod, place): def create_lod_tensor(data, recursive_seq_lens, place):
""" """
Create a lod tensor from a numpy array, a list, or an existing lod tensor. Create a lod tensor from a numpy array, a list, or an existing lod tensor.
Create a lod tensor by doing the following: Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid. 1. Check that the length-based level of detail (LoD) also known as
recursive_sequence_lengths of the input is valid.
2. Convert the length-based lod to a offset-based LoD. 2. Convert recursive_sequence_lengths to an offset-based LoD.
3. Copy the data from a numpy array, a list or an existing lod tensor to 3. Copy the data from a numpy array, a list or an existing lod tensor to
CPU or GPU device (based on input place). CPU or GPU device (based on input place).
...@@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place): ...@@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place):
Suppose we want LoDTensor to hold data for sequences of word, where each Suppose we want LoDTensor to hold data for sequences of word, where each
word is represented by an integer. If we want to create a LoDTensor to word is represented by an integer. If we want to create a LoDTensor to
represent two sentences, one of 2 words, and one of 3 words. represent two sentences, one of 2 words, and one of 3 words.
Then :code:`data` can be a numpy array of integers with shape (5, 1). Then :code:`data` can be a numpy array of integers with shape (5, 1).
:code:`lod` will be [[2, 3]], indicating the length(# of words) in each :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each
sentence. This length-based input lod [[2, 3]] will be converted to sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
offset-based lod [[0, 2, 5]] inside the function call. offset-based LoD [[0, 2, 5]] inside the function call.
Please reference :ref:`api_guide_low_level_lod_tensor` for more details Please reference :ref:`api_guide_low_level_lod_tensor` for more details
regarding LoD. regarding LoD.
Args: Args:
data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
list holding the data to be copied. list holding the data to be copied.
lod(list): a list of lists indicating the length-based LoD info recursive_seq_lens(list): a list of lists indicating the length-based level of detail
specified by the user. info specified by the user.
place(Place): CPU or GPU place indicating where the data in the new place(Place): CPU or GPU place indicating where the data in the new
LoDTensor will be stored. LoDTensor will be stored.
Returns: Returns:
A fluid LoDTensor object with tensor data and lod info. A fluid LoDTensor object with tensor data and recursive_seq_lens info.
""" """
if isinstance(data, core.LoDTensor): if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place) return create_lod_tensor(np.array(data), recursive_seq_lens, place)
elif isinstance(data, list): elif isinstance(data, list):
# When input data is a list, it only deals with the case where the base element # When input data is a list, it only deals with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence. # of words or other indexes in the sequence.
new_lod = [] new_recursive_seq_lens = []
for seq in data: for seq in data:
new_lod.append(len(seq)) new_recursive_seq_lens.append(len(seq))
assert [new_lod] == lod, "data and lod do not match" assert [
new_recursive_seq_lens
] == recursive_seq_lens, "data and recursive_seq_lens do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1]) flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place) return create_lod_tensor(flattened_data, recursive_seq_lens, place)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(data, place) tensor.set(data, place)
tensor.set_recursive_sequence_lengths(lod) tensor.set_recursive_sequence_lengths(recursive_seq_lens)
assert tensor.has_valid_recursive_sequence_lengths( assert tensor.has_valid_recursive_sequence_lengths(
), "the provided lod info is invalid" ), "the provided lod info is invalid"
return tensor return tensor
...@@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place): ...@@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place):
"data should be either a LoDTensor, a Numpy array or a list") "data should be either a LoDTensor, a Numpy array or a list")
def create_random_int_lodtensor(lod, base_shape, place, low, high): def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
high):
""" """
Create a LoDTensor containing random integers. Create a LoDTensor containing random integers.
...@@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): ...@@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
The function does the following: The function does the following:
1. Calculate the overall shape of the LoDTensor based on the length-based 1. Calculate the overall shape of the LoDTensor based on the length-based
:code:`lod` input and the shape of the basic element in :code:`recursive_seq_lens` input and the shape of the basic element in
:code:`base_shape`. :code:`base_shape`.
2. Create a numpy array of this shape. 2. Create a numpy array of this shape.
...@@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): ...@@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
Suppose we want LoDTensor to hold data for sequences of word, where each Suppose we want LoDTensor to hold data for sequences of word, where each
word is represented by an integer. If we want to create a LoDTensor to word is represented by an integer. If we want to create a LoDTensor to
represent two sentences, one of 2 words, and one of 3 words. Then represent two sentences, one of 2 words, and one of 3 words. Then
'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
for two sentences.
Args: Args:
lod(list): a list of lists indicating the length-based LoD info recursive_seq_lens(list): a list of lists indicating the length-based
specified by the user. level of detail info specified by the user.
base_shape(list): the shape of the basic element to be held by the base_shape(list): the shape of the basic element to be held by the
LoDTensor. LoDTensor.
place(Place): CPU or GPU place indicating where the data in the new place(Place): CPU or GPU place indicating where the data in the new
...@@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): ...@@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
high(int): the upper bound of the random integers. high(int): the upper bound of the random integers.
Returns: Returns:
A fluid LoDTensor object with tensor data and lod info. A fluid LoDTensor object with tensor data and recursive_seq_lens info.
""" """
assert isinstance(base_shape, list), "base_shape should be a list" assert isinstance(base_shape, list), "base_shape should be a list"
# append the total number of basic elements to the front of its shape # append the total number of basic elements to the front of its shape
overall_shape = [sum(lod[-1])] + base_shape overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
# the range of integer data elements is [low, high] # the range of integer data elements is [low, high]
data = np.random.random_integers(low, high, overall_shape).astype("int64") data = np.random.random_integers(low, high, overall_shape).astype("int64")
return create_lod_tensor(data, lod, place) return create_lod_tensor(data, recursive_seq_lens, place)
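To illustrate the renamed argument, a minimal sketch (data values are placeholders): recursive_seq_lens carries the length-based LoD, here two sequences of 2 and 3 elements, which is converted internally to the offset-based LoD [[0, 2, 5]].

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
data = np.arange(5).reshape(5, 1).astype('int64')
t = fluid.create_lod_tensor(data, recursive_seq_lens=[[2, 3]], place=place)
print(t.recursive_sequence_lengths())   # expected: [[2, 3]]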
...@@ -596,12 +596,12 @@ class Auc(MetricBase): ...@@ -596,12 +596,12 @@ class Auc(MetricBase):
tp, fn, tn, fp = 0, 0, 0, 0 tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels): for i, lbl in enumerate(labels):
if lbl: if lbl:
if predictions[i, 1] >= thresh: if preds[i, 1] >= thresh:
tp += 1 tp += 1
else: else:
fn += 1 fn += 1
else: else:
if predictions[i, 1] >= thresh: if preds[i, 1] >= thresh:
fp += 1 fp += 1
else: else:
tn += 1 tn += 1
......
...@@ -1113,7 +1113,6 @@ class ModelAverage(Optimizer): ...@@ -1113,7 +1113,6 @@ class ModelAverage(Optimizer):
Args: Args:
average_window_rate: The rate of average window. average_window_rate: The rate of average window.
params_grads: A list of parameter-grad variable pairs.
min_average_window: The minimum size of average window. min_average_window: The minimum size of average window.
max_average_window: The maximum size of average window. max_average_window: The maximum size of average window.
...@@ -1122,8 +1121,8 @@ class ModelAverage(Optimizer): ...@@ -1122,8 +1121,8 @@ class ModelAverage(Optimizer):
.. code-block:: python .. code-block:: python
optimizer = fluid.optimizer.Momentum() optimizer = fluid.optimizer.Momentum()
_, params_grads = optimizer.minimize(cost) optimizer.minimize(cost)
model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, model_average = fluid.optimizer.ModelAverage(0.15,
min_average_window=10000, min_average_window=10000,
max_average_window=20000) max_average_window=20000)
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
...@@ -1137,7 +1136,6 @@ class ModelAverage(Optimizer): ...@@ -1137,7 +1136,6 @@ class ModelAverage(Optimizer):
def __init__(self, def __init__(self,
average_window_rate, average_window_rate,
params_grads=None,
min_average_window=10000, min_average_window=10000,
max_average_window=10000, max_average_window=10000,
**kwargs): **kwargs):
...@@ -1146,21 +1144,16 @@ class ModelAverage(Optimizer): ...@@ -1146,21 +1144,16 @@ class ModelAverage(Optimizer):
self.min_average_window = min_average_window self.min_average_window = min_average_window
self.max_average_window = max_average_window self.max_average_window = max_average_window
self.params_grads = [] if params_grads is None else params_grads self.params_grads = []
params = {}
for param, grad in self.params_grads:
if param.do_model_average != False:
params[param.name] = (param, grad)
for param in framework.default_main_program().global_block( for param in framework.default_main_program().global_block(
).all_parameters(): ).all_parameters():
if param.name not in params and param.do_model_average != False: if param.do_model_average != False:
grad = param.block.create_var( grad = param.block.create_var(
name=unique_name.generate(".".join([param.name, 'tmp'])), name=unique_name.generate(".".join([param.name, 'tmp'])),
dtype=param.dtype, dtype=param.dtype,
persistable=False, persistable=False,
stop_gradient=True) stop_gradient=True)
params[param.name] = (param, grad) self.params_grads.append((param, grad))
self.params_grads = params.values()
for param, grad in self.params_grads: for param, grad in self.params_grads:
self._append_average_accumulate_op(param) self._append_average_accumulate_op(param)
......
...@@ -160,7 +160,7 @@ class ParallelExecutor(object): ...@@ -160,7 +160,7 @@ class ParallelExecutor(object):
build_strategy, num_trainers, trainer_id) build_strategy, num_trainers, trainer_id)
self.scope = scope self.scope = scope
def run(self, fetch_list, feed=None, feed_dict=None): def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
""" """
Run a parallel executor with fetch_list. Run a parallel executor with fetch_list.
...@@ -196,6 +196,8 @@ class ParallelExecutor(object): ...@@ -196,6 +196,8 @@ class ParallelExecutor(object):
to each device. Default None. to each device. Default None.
feed_dict: Alias for feed parameter, for backward compatibility. feed_dict: Alias for feed parameter, for backward compatibility.
This parameter has been deprecated. Default None. This parameter has been deprecated. Default None.
return_numpy(bool): Whether to convert the fetched tensors to numpy arrays.
Default: True.
Returns: Returns:
List: The fetched result list. List: The fetched result list.
...@@ -270,6 +272,9 @@ class ParallelExecutor(object): ...@@ -270,6 +272,9 @@ class ParallelExecutor(object):
if self.is_dist: if self.is_dist:
self.bcast_params() self.bcast_params()
if return_numpy:
return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self): def bcast_params(self):
......
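A minimal sketch of the new return_numpy switch on a tiny made-up network (assumes a build where ParallelExecutor can run on CPU places): with the default True the fetched values come back as numpy arrays, with False they stay LoDTensors, so lod information remains available.

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name)

feed = {'x': np.random.rand(8, 2).astype('float32')}
loss_np, = pe.run(fetch_list=[loss.name], feed=feed)                          # numpy array
loss_tensor, = pe.run(fetch_list=[loss.name], feed=feed, return_numpy=False)  # LoDTensor
print(loss_np, np.array(loss_tensor))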
...@@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname): ...@@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname):
inferencer = fluid.Inferencer( inferencer = fluid.Inferencer(
inference_program, param_path=params_dirname, place=place) inference_program, param_path=params_dirname, place=place)
# Setup inputs by creating LoDTensors to represent sequences of words. # Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of these LoDTensors and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensors will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
word = fluid.create_random_int_lodtensor( word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n2 = fluid.create_random_int_lodtensor( ctx_n2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n1 = fluid.create_random_int_lodtensor( ctx_n1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_0 = fluid.create_random_int_lodtensor( ctx_0 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p1 = fluid.create_random_int_lodtensor( ctx_p1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p2 = fluid.create_random_int_lodtensor( ctx_p2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
pred = fluid.create_random_int_lodtensor( pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
mark = fluid.create_random_int_lodtensor( mark = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
results = inferencer.infer( results = inferencer.infer(
{ {
......
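To make the comment above concrete, a small sketch of what create_random_int_lodtensor builds from recursive_seq_lens = [[3, 4, 2]] (the vocabulary bound 99 is arbitrary):

import paddle.fluid as fluid

place = fluid.CPUPlace()
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
word = fluid.create_random_int_lodtensor(
    recursive_seq_lens, base_shape, place, low=0, high=99)
# 3 + 4 + 2 words, each of shape [1], stacked into one tensor
print(word.shape())                          # [9, 1]
print(word.recursive_sequence_lengths())     # [[3, 4, 2]]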
...@@ -127,9 +127,19 @@ def decode(context, is_sparse): ...@@ -127,9 +127,19 @@ def decode(context, is_sparse):
current_score = pd.fc(input=current_state_with_lod, current_score = pd.fc(input=current_state_with_lod,
size=target_dict_dim, size=target_dict_dim,
act='softmax') act='softmax')
topk_scores, topk_indices = pd.topk(current_score, k=topk_size) topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
# calculate accumulated scores after topk to reduce computation cost
accu_scores = pd.elementwise_add(
x=pd.log(topk_scores), y=pd.reshape(
pre_score, shape=[-1]), axis=0)
selected_ids, selected_scores = pd.beam_search( selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) pre_ids,
pre_score,
topk_indices,
accu_scores,
beam_size,
end_id=10,
level=0)
pd.increment(x=counter, value=1, in_place=True) pd.increment(x=counter, value=1, in_place=True)
...@@ -138,10 +148,14 @@ def decode(context, is_sparse): ...@@ -138,10 +148,14 @@ def decode(context, is_sparse):
pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter)
pd.array_write(selected_scores, array=scores_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter)
pd.less_than(x=counter, y=array_len, cond=cond) # update the break condition: up to the max length or all candidates of
# source sentences have ended.
length_cond = pd.less_than(x=counter, y=array_len)
finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
pd.logical_and(x=length_cond, y=finish_cond, out=cond)
translation_ids, translation_scores = pd.beam_search_decode( translation_ids, translation_scores = pd.beam_search_decode(
ids=ids_array, scores=scores_array) ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
# return init_ids, init_scores # return init_ids, init_scores
...@@ -215,11 +229,13 @@ def decode_main(use_cuda, is_sparse): ...@@ -215,11 +229,13 @@ def decode_main(use_cuda, is_sparse):
[1. for _ in range(batch_size)], dtype='float32') [1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1)) init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [1] * batch_size init_recursive_seq_lens = [1] * batch_size
init_lod = [init_lod, init_lod] init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) place)
init_scores = fluid.create_lod_tensor(init_scores_data,
init_recursive_seq_lens, place)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -243,7 +259,7 @@ def decode_main(use_cuda, is_sparse): ...@@ -243,7 +259,7 @@ def decode_main(use_cuda, is_sparse):
feed=feed_dict, feed=feed_dict,
fetch_list=[translation_ids, translation_scores], fetch_list=[translation_ids, translation_scores],
return_numpy=False) return_numpy=False)
print result_ids.lod() print result_ids.recursive_sequence_lengths()
break break
......
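The accu_scores computation above adds the log-probability of each top-k candidate to the running score of its prefix before beam_search prunes. A framework-free sketch with made-up numbers for one source sentence and beam_size = 2:

import numpy as np

beam_size = 2
pre_score = np.array([-0.5, -1.2])            # accumulated log-probs of the 2 live prefixes
current_score = np.array([[0.1, 0.6, 0.3],    # softmax over a toy 3-word vocabulary
                          [0.7, 0.2, 0.1]])

topk_indices = np.argsort(-current_score, axis=1)[:, :beam_size]
topk_scores = -np.sort(-current_score, axis=1)[:, :beam_size]
accu_scores = np.log(topk_scores) + pre_score[:, None]   # the elementwise_add above

# beam_search then keeps the best beam_size continuations overall
flat_order = np.argsort(-accu_scores.flatten())[:beam_size]
print(topk_indices.flatten()[flat_order])   # selected word ids
print(accu_scores.flatten()[flat_order])    # their accumulated scores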
...@@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname): ...@@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname):
inference_program, param_path=params_dirname, place=place) inference_program, param_path=params_dirname, place=place)
# Use the first data from paddle.dataset.movielens.test() as input. # Use the first data from paddle.dataset.movielens.test() as input.
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor, # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
# where `data` is a list of sequences of index numbers, `lod` is # to generate LoD Tensor where `data` is a list of sequences of index
# the level of detail (lod) info associated with `data`. # numbers, `recursive_sequence_lengths` is the length-based level of detail
# (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains # For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively. # two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info, # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
# indicating that `data` consists of two sequences of length 3 and 2. # level of detail info, indicating that `data` consists of two sequences
# of length 3 and 2, respectively.
user_id = fluid.create_lod_tensor([[1]], [[1]], place) user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place) gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place) age_id = fluid.create_lod_tensor([[0]], [[1]], place)
......
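A short sketch of the create_lod_tensor call described in the comment (the index values themselves are arbitrary):

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
data = [[10, 2, 3], [2, 3]]               # two index sequences, lengths 3 and 2
recursive_seq_lens = [[3, 2]]             # one length-based lod level
t = fluid.create_lod_tensor(data, recursive_seq_lens, place)
print(t.recursive_sequence_lengths())     # [[3, 2]]
print(np.array(t).flatten())              # [10  2  3  2  3]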
...@@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
# Here each word is the basic element of the LoDTensor and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor( tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1) recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words}) results = inferencer.infer({'words': tensor_words})
print("infer results: ", results) print("infer results: ", results)
......
...@@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
# Here each word is the basic element of the LoDTensor and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor( tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1) recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words}) results = inferencer.infer({'words': tensor_words})
print("infer results: ", results) print("infer results: ", results)
......
...@@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
# Here each word is the basic element of the LoDTensor and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor( tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=len(word_dict) - 1) recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
results = inferencer.infer({'words': tensor_words}) results = inferencer.infer({'words': tensor_words})
print("infer results: ", results) print("infer results: ", results)
......
...@@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None):
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence # is simply an index to look up for the corresponding word vector and hence
# the shape of word (base_shape) should be [1]. The length-based level of # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only # which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
# one lod_level and there is only one sequence of one word on this level. # meaning there is only one level of detail and there is only one sequence of
# Note that lod info should be a list of lists. # one word on this level.
lod = [[1]] # Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[1]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
first_word = fluid.create_random_int_lodtensor( first_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
second_word = fluid.create_random_int_lodtensor( second_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
third_word = fluid.create_random_int_lodtensor( third_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
fourth_word = fluid.create_random_int_lodtensor( fourth_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
result = inferencer.infer( result = inferencer.infer(
{ {
......
...@@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None): ...@@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None):
# Here each word is the basic element of the LoDTensor and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensor will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
tensor_words = fluid.create_random_int_lodtensor( tensor_words = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
...@@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None): ...@@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None):
feed={feed_target_names[0]: tensor_words}, feed={feed_target_names[0]: tensor_words},
fetch_list=fetch_targets, fetch_list=fetch_targets,
return_numpy=False) return_numpy=False)
print(results[0].lod()) print(results[0].recursive_sequence_lengths())
np_data = np.array(results[0]) np_data = np.array(results[0])
print("Inference Shape: ", np_data.shape) print("Inference Shape: ", np_data.shape)
print("Inference results: ", np_data) print("Inference results: ", np_data)
......
...@@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None): ...@@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None):
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# Setup inputs by creating LoDTensors to represent sequences of words. # Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of these LoDTensors and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one lod level. Then the created LoDTensors will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of # element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively. # length 3, 4 and 2, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[3, 4, 2]] recursive_seq_lens = [[3, 4, 2]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
word = fluid.create_random_int_lodtensor( word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
pred = fluid.create_random_int_lodtensor( pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=pred_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=pred_dict_len - 1)
ctx_n2 = fluid.create_random_int_lodtensor( ctx_n2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
ctx_n1 = fluid.create_random_int_lodtensor( ctx_n1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
ctx_0 = fluid.create_random_int_lodtensor( ctx_0 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
ctx_p1 = fluid.create_random_int_lodtensor( ctx_p1 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
ctx_p2 = fluid.create_random_int_lodtensor( ctx_p2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=word_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=word_dict_len - 1)
mark = fluid.create_random_int_lodtensor( mark = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=mark_dict_len - 1) recursive_seq_lens,
base_shape,
place,
low=0,
high=mark_dict_len - 1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
...@@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None): ...@@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None):
}, },
fetch_list=fetch_targets, fetch_list=fetch_targets,
return_numpy=False) return_numpy=False)
print(results[0].lod()) print(results[0].recursive_sequence_lengths())
np_data = np.array(results[0]) np_data = np.array(results[0])
print("Inference Shape: ", np_data.shape) print("Inference Shape: ", np_data.shape)
......
...@@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse): ...@@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse):
pre_state = pd.array_read(array=state_array, i=counter) pre_state = pd.array_read(array=state_array, i=counter)
pre_score = pd.array_read(array=scores_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter)
# expand the lod of pre_state to be the same with pre_score # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
pre_ids_emb = pd.embedding( pre_ids_emb = pd.embedding(
...@@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): ...@@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse):
current_score = pd.fc(input=current_state_with_lod, current_score = pd.fc(input=current_state_with_lod,
size=target_dict_dim, size=target_dict_dim,
act='softmax') act='softmax')
topk_scores, topk_indices = pd.topk(current_score, k=50) topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
# calculate accumulated scores after topk to reduce computation cost
accu_scores = pd.elementwise_add(
x=pd.log(topk_scores), y=pd.reshape(
pre_score, shape=[-1]), axis=0)
selected_ids, selected_scores = pd.beam_search( selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) pre_ids,
pre_score,
topk_indices,
accu_scores,
beam_size,
end_id=10,
level=0)
pd.increment(x=counter, value=1, in_place=True) pd.increment(x=counter, value=1, in_place=True)
...@@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse): ...@@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse):
pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter)
pd.array_write(selected_scores, array=scores_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter)
pd.less_than(x=counter, y=array_len, cond=cond) # update the break condition: up to the max length or all candidates of
# source sentences have ended.
length_cond = pd.less_than(x=counter, y=array_len)
finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
pd.logical_and(x=length_cond, y=finish_cond, out=cond)
translation_ids, translation_scores = pd.beam_search_decode( translation_ids, translation_scores = pd.beam_search_decode(
ids=ids_array, scores=scores_array) ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
# return init_ids, init_scores # return init_ids, init_scores
...@@ -238,11 +252,13 @@ def decode_main(use_cuda, is_sparse): ...@@ -238,11 +252,13 @@ def decode_main(use_cuda, is_sparse):
[1. for _ in range(batch_size)], dtype='float32') [1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1)) init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [1] * batch_size init_recursive_seq_lens = [1] * batch_size
init_lod = [init_lod, init_lod] init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) place)
init_scores = fluid.create_lod_tensor(init_scores_data,
init_recursive_seq_lens, place)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -266,7 +282,7 @@ def decode_main(use_cuda, is_sparse): ...@@ -266,7 +282,7 @@ def decode_main(use_cuda, is_sparse):
feed=feed_dict, feed=feed_dict,
fetch_list=[translation_ids, translation_scores], fetch_list=[translation_ids, translation_scores],
return_numpy=False) return_numpy=False)
print result_ids.lod() print result_ids.recursive_sequence_lengths()
break break
......
...@@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None): ...@@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input # Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id" assert feed_target_names[0] == "user_id"
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
# where `data` is a list of sequences of index numbers, `lod` is # to generate LoD Tensor where `data` is a list of sequences of index
# the level of detail (lod) info associated with `data`. # numbers, `recursive_sequence_lengths` is the length-based level of detail
# (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains # For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively. # two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info, # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
# indicating that `data` consists of two sequences of length 3 and 2. # level of detail info, indicating that `data` consists of two sequences
# of length 3 and 2, respectively.
user_id = fluid.create_lod_tensor([[1]], [[1]], place) user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id" assert feed_target_names[1] == "gender_id"
......
...@@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None): ...@@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None):
# Here each word is the basic element of the LoDTensor and the shape of # Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector. # look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[4, 6]], # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
# which has only one lod level. Then the created LoDTensor will have only # which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic # one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for two sentences of # element (word). Hence the LoDTensor will hold data for two sentences of
# length 4 and 6, respectively. # length 4 and 6, respectively.
# Note that lod info should be a list of lists. # Note that recursive_sequence_lengths should be a list of lists.
lod = [[4, 6]] recursive_seq_lens = [[4, 6]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
word_data = fluid.create_random_int_lodtensor( word_data = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=1) recursive_seq_lens, base_shape, place, low=0, high=1)
trg_word = fluid.create_random_int_lodtensor( trg_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=1) recursive_seq_lens, base_shape, place, low=0, high=1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
...@@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None): ...@@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None):
}, },
fetch_list=fetch_targets, fetch_list=fetch_targets,
return_numpy=False) return_numpy=False)
print(results[0].lod()) print(results[0].recursive_sequence_lengths())
np_data = np.array(results[0]) np_data = np.array(results[0])
print("Inference shape: ", np_data.shape) print("Inference shape: ", np_data.shape)
print("Inference results: ", np_data) print("Inference results: ", np_data)
......
...@@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None): ...@@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None):
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence # is simply an index to look up for the corresponding word vector and hence
# the shape of word (base_shape) should be [1]. The length-based level of # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only # which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
# one lod_level and there is only one sequence of one word on this level. # meaning there is only one level of detail and there is only one sequence of
# Note that lod info should be a list of lists. # one word on this level.
lod = [[1]] # Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[1]]
base_shape = [1] base_shape = [1]
# The range of random integers is [low, high] # The range of random integers is [low, high]
first_word = fluid.create_random_int_lodtensor( first_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
second_word = fluid.create_random_int_lodtensor( second_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
third_word = fluid.create_random_int_lodtensor( third_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
fourth_word = fluid.create_random_int_lodtensor( fourth_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=dict_size - 1) recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
assert feed_target_names[0] == 'firstw' assert feed_target_names[0] == 'firstw'
assert feed_target_names[1] == 'secondw' assert feed_target_names[1] == 'secondw'
...@@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None): ...@@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None):
}, },
fetch_list=fetch_targets, fetch_list=fetch_targets,
return_numpy=False) return_numpy=False)
print(results[0].lod()) print(results[0].recursive_sequence_lengths())
np_data = np.array(results[0]) np_data = np.array(results[0])
print("Inference Shape: ", np_data.shape) print("Inference Shape: ", np_data.shape)
......
...@@ -19,18 +19,21 @@ import unittest ...@@ -19,18 +19,21 @@ import unittest
class TestLoDTensor(unittest.TestCase): class TestLoDTensor(unittest.TestCase):
def test_pybind_lod(self): def test_pybind_recursive_seq_lens(self):
tensor = fluid.LoDTensor() tensor = fluid.LoDTensor()
lod = [] recursive_seq_lens = []
tensor.set_recursive_sequence_lengths(lod) tensor.set_recursive_sequence_lengths(recursive_seq_lens)
lod = [[], [1], [3]] recursive_seq_lens = [[], [1], [3]]
self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
lod = [[0], [2], [3]] recursive_seq_lens)
self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) recursive_seq_lens = [[0], [2], [3]]
self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
recursive_seq_lens)
lod = [[1, 2, 3]] recursive_seq_lens = [[1, 2, 3]]
tensor.set_recursive_sequence_lengths(lod) tensor.set_recursive_sequence_lengths(recursive_seq_lens)
self.assertEqual(tensor.recursive_sequence_lengths(), lod) self.assertEqual(tensor.recursive_sequence_lengths(),
recursive_seq_lens)
tensor.set(np.random.random([6, 1]), fluid.CPUPlace()) tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
...@@ -38,13 +41,14 @@ class TestLoDTensor(unittest.TestCase): ...@@ -38,13 +41,14 @@ class TestLoDTensor(unittest.TestCase):
# Each level's sum should be equal to the number of items in the next level # Each level's sum should be equal to the number of items in the next level
# Moreover, last level's sum should be equal to the tensor height # Moreover, last level's sum should be equal to the tensor height
lod = [[2, 3], [1, 3, 1, 2, 2]] recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]]
tensor.set_recursive_sequence_lengths(lod) tensor.set_recursive_sequence_lengths(recursive_seq_lens)
self.assertEqual(tensor.recursive_sequence_lengths(), lod) self.assertEqual(tensor.recursive_sequence_lengths(),
recursive_seq_lens)
tensor.set(np.random.random([8, 1]), fluid.CPUPlace()) tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
lod = [[2, 3], [1, 3, 1, 2, 1]] recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]]
tensor.set_recursive_sequence_lengths(lod) tensor.set_recursive_sequence_lengths(recursive_seq_lens)
self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
...@@ -52,35 +56,42 @@ class TestLoDTensor(unittest.TestCase): ...@@ -52,35 +56,42 @@ class TestLoDTensor(unittest.TestCase):
def test_create_lod_tensor(self): def test_create_lod_tensor(self):
# Create LoDTensor from a list # Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]] data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]] wrong_recursive_seq_lens = [[2, 2]]
correct_lod = [[3, 2]] correct_recursive_seq_lens = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod, self.assertRaises(AssertionError, create_lod_tensor, data,
fluid.CPUPlace()) wrong_recursive_seq_lens, fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace()) tensor = create_lod_tensor(data, correct_recursive_seq_lens,
self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod) fluid.CPUPlace())
self.assertEqual(tensor.recursive_sequence_lengths(),
correct_recursive_seq_lens)
# Create LoDTensor from numpy array # Create LoDTensor from numpy array
data = np.random.random([10, 1]) data = np.random.random([10, 1])
lod = [[2, 1], [3, 3, 4]] recursive_seq_lens = [[2, 1], [3, 3, 4]]
tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace())
self.assertEqual(tensor.recursive_sequence_lengths(), lod) self.assertEqual(tensor.recursive_sequence_lengths(),
recursive_seq_lens)
# Create LoDTensor from another LoDTensor, they are different instances # Create LoDTensor from another LoDTensor, they are different instances
new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]]
new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens,
self.assertEqual(tensor.recursive_sequence_lengths(), lod) fluid.CPUPlace())
self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod) self.assertEqual(tensor.recursive_sequence_lengths(),
recursive_seq_lens)
self.assertEqual(new_tensor.recursive_sequence_lengths(),
new_recursive_seq_lens)
def test_create_random_int_lodtensor(self): def test_create_random_int_lodtensor(self):
# The shape of a word, commonly used in speech and NLP problem, is [1] # The shape of a word, commonly used in speech and NLP problem, is [1]
shape = [1] shape = [1]
lod = [[2, 3, 5]] recursive_seq_lens = [[2, 3, 5]]
dict_size = 10000 dict_size = 10000
low = 0 low = 0
high = dict_size - 1 high = dict_size - 1
tensor = create_random_int_lodtensor(lod, shape, tensor = create_random_int_lodtensor(recursive_seq_lens, shape,
fluid.CPUPlace(), low, high) fluid.CPUPlace(), low, high)
self.assertEqual(tensor.recursive_sequence_lengths(), lod) self.assertEqual(tensor.recursive_sequence_lengths(),
recursive_seq_lens)
self.assertEqual(tensor.shape(), [10, 1]) self.assertEqual(tensor.shape(), [10, 1])
......
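The assertions above hinge on one consistency rule, restated here as a tiny sketch: each level's lengths must sum to the number of entries in the next level, and the last level's lengths must sum to the tensor height.

import numpy as np
import paddle.fluid as fluid

tensor = fluid.LoDTensor()
tensor.set_recursive_sequence_lengths([[2, 3], [1, 3, 1, 2, 1]])
tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
# 2 + 3 == 5 entries on the next level, 1 + 3 + 1 + 2 + 1 == 8 rows in the tensor
print(tensor.has_valid_recursive_sequence_lengths())   # True

tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
print(tensor.has_valid_recursive_sequence_lengths())   # False: lengths sum to 8, not 9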
...@@ -51,3 +51,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) ...@@ -51,3 +51,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
...@@ -18,6 +18,8 @@ import unittest ...@@ -18,6 +18,8 @@ import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import time import time
import numpy as np import numpy as np
import math
import sys
__all__ = ['TestParallelExecutorBase'] __all__ = ['TestParallelExecutorBase']
...@@ -81,7 +83,6 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -81,7 +83,6 @@ class TestParallelExecutorBase(unittest.TestCase):
begin = time.time() begin = time.time()
first_loss, = run_executor( first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name]) exe=exe, feed=feed_dict, fetch_list=[loss.name])
first_loss = np.array(first_loss)
for i in xrange(iter): for i in xrange(iter):
run_executor(exe=exe, feed=feed_dict, fetch_list=[]) run_executor(exe=exe, feed=feed_dict, fetch_list=[])
...@@ -94,7 +95,11 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -94,7 +95,11 @@ class TestParallelExecutorBase(unittest.TestCase):
print "%.4f Instance per second" % ( print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin)) (batch_size * iter + 2) / (end - begin))
last_loss = np.array(last_loss) avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print first_loss, last_loss print first_loss, last_loss
# self.assertGreater(first_loss[0], last_loss[0]) # self.assertGreater(first_loss[0], last_loss[0])
......
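The added guard, shown in isolation with made-up per-device losses; the mean is taken first because a multi-device fetch returns one loss value per device.

import math
import sys
import numpy as np

first_loss = np.array([0.693, 0.702])    # per-device losses from the first run
last_loss = np.array([0.410, 0.398])     # per-device losses from the last run
avg_first_loss_val = np.array(first_loss).mean()
avg_last_loss_val = np.array(last_loss).mean()
if math.isnan(float(avg_last_loss_val)) or math.isnan(float(avg_first_loss_val)):
    sys.exit("got NaN loss, training failed.")   # any NaN aborts the test early
print(avg_first_loss_val, avg_last_loss_val)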
...@@ -20,7 +20,7 @@ from op_test import OpTest ...@@ -20,7 +20,7 @@ from op_test import OpTest
class TestArgsortOp(OpTest): class TestArgsortOp(OpTest):
def setUp(self): def setUp(self):
self.init_axis() self.init_axis()
x = np.random.random((2, 3, 4, 5)).astype("float32") x = np.random.random((2, 3, 4, 5, 10)).astype("float32")
if self.axis < 0: if self.axis < 0:
self.axis = self.axis + len(x.shape) self.axis = self.axis + len(x.shape)
self.indices = np.argsort(x, kind='quicksort', axis=self.axis) self.indices = np.argsort(x, kind='quicksort', axis=self.axis)
......
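The numpy reference the test builds, shown on a smaller array (the shape, axis, and the sorted-values check are illustrative only):

import numpy as np

axis = 1
x = np.random.random((2, 3, 4)).astype("float32")
indices = np.argsort(x, kind='quicksort', axis=axis)   # indices that sort x along `axis`
sorted_x = np.sort(x, kind='quicksort', axis=axis)     # the values of x in sorted order
print(indices.shape, sorted_x.shape)                   # both (2, 3, 4)
print(np.all(np.diff(sorted_x, axis=axis) >= 0))       # True: sorted along the chosen axis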
...@@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference): ...@@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
def init_kernel_type(self):
self.use_mkldnn = True
self.fuse_with_relu = True
def test_check_output(self):
place = core.CPUPlace()
data_format = "NCHW"
self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase):
def setUp(self): def setUp(self):
self.dtype = np.float32 self.dtype = np.float32
self.use_mkldnn = False self.use_mkldnn = False
self.fuse_with_relu = False
self.init_kernel_type() self.init_kernel_type()
def __assert_close(self, tensor, np_array, msg, atol=1e-4): def __assert_close(self, tensor, np_array, msg, atol=1e-4):
...@@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase):
scale_shape = [c] scale_shape = [c]
x_val = np.random.random_sample(x_shape).astype(dtype) x_val = np.random.random_sample(x_shape).astype(dtype)
# generate some negative values to test the case with relu fused
x_val = x_val - 0.5
scale_val = np.random.random_sample(scale_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32)
bias_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32)
...@@ -188,6 +191,8 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -188,6 +191,8 @@ class TestBatchNormOpInference(unittest.TestCase):
y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
epsilon, data_layout).astype(dtype) epsilon, data_layout).astype(dtype)
if self.fuse_with_relu:
y_out = np.maximum(y_out, 0)
scope = core.Scope() scope = core.Scope()
...@@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase): ...@@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase):
is_test=True, is_test=True,
data_layout=data_layout, data_layout=data_layout,
use_mkldnn=self.use_mkldnn, use_mkldnn=self.use_mkldnn,
fuse_with_relu=self.fuse_with_relu,
epsilon=epsilon) epsilon=epsilon)
batch_norm_op.run(scope, place) batch_norm_op.run(scope, place)
...@@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
def setUp(self): def setUp(self):
self.dtype = np.float16 self.dtype = np.float16
self.use_mkldnn = False self.use_mkldnn = False
self.fuse_with_relu = False
self.init_kernel_type() self.init_kernel_type()
def test_check_output(self): def test_check_output(self):
...@@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): ...@@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
class TestBatchNormOpTraining(unittest.TestCase): class TestBatchNormOpTraining(unittest.TestCase):
def setUp(self): def setUp(self):
self.use_mkldnn = False self.use_mkldnn = False
self.fuse_with_relu = False
self.data_formats = ["NCHW", "NHWC"] self.data_formats = ["NCHW", "NHWC"]
self.init_kernel_type() self.init_kernel_type()
...@@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase): ...@@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
"epsilon": epsilon, "epsilon": epsilon,
"is_test": False, "is_test": False,
"data_layout": data_layout, "data_layout": data_layout,
"use_mkldnn": self.use_mkldnn "use_mkldnn": self.use_mkldnn,
"fuse_with_relu": self.fuse_with_relu
}) })
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
......
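What fuse_with_relu adds to the reference path, as a small NCHW inference-mode sketch (not the MKLDNN kernel; epsilon and shapes are arbitrary):

import numpy as np

def batch_norm_relu_infer(x, scale, bias, mean, var, epsilon=1e-5, fuse_with_relu=True):
    c = x.shape[1]
    y = (x - mean.reshape(1, c, 1, 1)) / np.sqrt(var.reshape(1, c, 1, 1) + epsilon)
    y = y * scale.reshape(1, c, 1, 1) + bias.reshape(1, c, 1, 1)
    # the fused relu simply clamps the normalized output at zero
    return np.maximum(y, 0) if fuse_with_relu else y

x = np.random.random_sample((2, 3, 4, 5)).astype("float32") - 0.5
scale = np.random.random_sample(3).astype(np.float32)
bias = np.random.random_sample(3).astype(np.float32)
mean, var = np.zeros(3, np.float32), np.ones(3, np.float32)
print(batch_norm_relu_infer(x, scale, bias, mean, var).min() >= 0)   # True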
...@@ -20,44 +20,58 @@ from paddle.fluid.op import Operator ...@@ -20,44 +20,58 @@ from paddle.fluid.op import Operator
class TestBeamSearchDecodeOp(unittest.TestCase): class TestBeamSearchDecodeOp(unittest.TestCase):
"""unittest of beam_search_decode_op"""
def setUp(self): def setUp(self):
self.scope = core.Scope() self.scope = core.Scope()
self.place = core.CPUPlace() self.place = core.CPUPlace()
def append_lod_tensor(self, tensor_array, lod, data): def append_lod_tensor(self, tensor_array, lod, data):
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
lod_tensor.set_recursive_sequence_lengths(lod) lod_tensor.set_lod(lod)
lod_tensor.set(data, self.place) lod_tensor.set(data, self.place)
tensor_array.append(lod_tensor) tensor_array.append(lod_tensor)
def test_get_set(self): def test_get_set(self):
ids = self.scope.var("ids").get_lod_tensor_array() ids = self.scope.var("ids").get_lod_tensor_array()
self.append_lod_tensor(
ids, [[3, 3], [1, 1, 1, 1, 1, 1]],
np.array(
[1, 2, 3, 4, 5, 6], dtype="int64"))
self.append_lod_tensor(
ids, [[3, 3], [1, 0, 2, 2, 0, 1]],
np.array(
[0, 1, 2, 3, 4, 5], dtype="int64"))
self.append_lod_tensor(
ids, [[3, 3], [0, 1, 1, 1, 1, 1]],
np.array(
[0, 1, 2, 3, 4], dtype="int64"))
scores = self.scope.var("scores").get_lod_tensor_array() scores = self.scope.var("scores").get_lod_tensor_array()
self.append_lod_tensor( # Construct sample data with 5 steps and 2 source sentences
scores, [[3, 3], [1, 1, 1, 1, 1, 1]], # beam_size = 2, end_id = 1
np.array( # start with start_id
[1, 2, 3, 4, 5, 6], dtype="float64")) [
self.append_lod_tensor( self.append_lod_tensor(
scores, [[3, 3], [1, 0, 2, 2, 0, 1]], array, [[0, 1, 2], [0, 1, 2]], np.array(
np.array( [0, 0], dtype=dtype))
[0, 1, 2, 3, 4, 5], dtype="float64")) for array, dtype in ((ids, "int64"), (scores, "float32"))
self.append_lod_tensor( ]
scores, [[3, 3], [0, 1, 1, 1, 1, 1]], [
np.array( self.append_lod_tensor(
[0, 1, 2, 3, 4], dtype="float64")) array, [[0, 1, 2], [0, 2, 4]],
np.array(
[2, 3, 4, 5], dtype=dtype))
for array, dtype in ((ids, "int64"), (scores, "float32"))
]
[
self.append_lod_tensor(
array, [[0, 2, 4], [0, 2, 2, 4, 4]],
np.array(
[3, 1, 5, 4], dtype=dtype))
for array, dtype in ((ids, "int64"), (scores, "float32"))
]
[
self.append_lod_tensor(
array, [[0, 2, 4], [0, 1, 2, 3, 4]],
np.array(
[1, 1, 3, 5], dtype=dtype))
for array, dtype in ((ids, "int64"), (scores, "float32"))
]
[
self.append_lod_tensor(
array, [[0, 2, 4], [0, 0, 0, 2, 2]],
np.array(
[5, 1], dtype=dtype))
for array, dtype in ((ids, "int64"), (scores, "float32"))
]
sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_ids = self.scope.var("sentence_ids").get_tensor()
sentence_scores = self.scope.var("sentence_scores").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor()
...@@ -69,18 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): ...@@ -69,18 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
Scores="scores", Scores="scores",
# outputs # outputs
SentenceIds="sentence_ids", SentenceIds="sentence_ids",
SentenceScores="sentence_scores") SentenceScores="sentence_scores",
beam_size=2,
end_id=1, )
beam_search_decode_op.run(self.scope, self.place) beam_search_decode_op.run(self.scope, self.place)
expected_lod = [[4, 4], [1, 2, 3, 3, 1, 3, 3, 3]] expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]]
self.assertEqual(sentence_ids.recursive_sequence_lengths(), self.assertEqual(sentence_ids.lod(), expected_lod)
expected_lod) self.assertEqual(sentence_scores.lod(), expected_lod)
self.assertEqual(sentence_scores.recursive_sequence_lengths(),
expected_lod)
expected_data = np.array( expected_data = np.array(
[2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64")
self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
self.assertTrue( self.assertTrue(
np.array_equal(np.array(sentence_scores), expected_data)) np.array_equal(np.array(sentence_scores), expected_data))
......
...@@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data): ...@@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data):
class BeamSearchOpTester(unittest.TestCase): class BeamSearchOpTester(unittest.TestCase):
"""unittest of beam_search_op"""
def setUp(self): def setUp(self):
self.scope = core.Scope() self.scope = core.Scope()
self._create_ids() self._create_ids()
self._create_pre_scores()
self._create_scores() self._create_scores()
self._create_pre_ids() self._create_pre_ids()
self.scope.var('selected_ids') self.scope.var('selected_ids')
...@@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase): ...@@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase):
def test_run(self): def test_run(self):
op = Operator( op = Operator(
'beam_search', 'beam_search',
pre_ids="pre_ids", pre_ids='pre_ids',
pre_scores='pre_scores',
ids='ids', ids='ids',
scores='scores', scores='scores',
selected_ids='selected_ids', selected_ids='selected_ids',
...@@ -47,19 +51,31 @@ class BeamSearchOpTester(unittest.TestCase): ...@@ -47,19 +51,31 @@ class BeamSearchOpTester(unittest.TestCase):
end_id=0, ) end_id=0, )
op.run(self.scope, core.CPUPlace()) op.run(self.scope, core.CPUPlace())
selected_ids = self.scope.find_var("selected_ids").get_tensor() selected_ids = self.scope.find_var("selected_ids").get_tensor()
print 'selected_ids', np.array(selected_ids) selected_scores = self.scope.find_var("selected_scores").get_tensor()
print 'lod', selected_ids.recursive_sequence_lengths() self.assertTrue(
np.allclose(
np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
self.assertTrue(
np.allclose(
np.array(selected_scores),
np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
self.assertEqual(selected_ids.lod(),
[[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]])
def _create_pre_ids(self): def _create_pre_ids(self):
np_data = np.array([[1, 2, 3, 4]], dtype='int64') np_data = np.array([[1, 2, 3, 4]], dtype='int64')
tensor = create_tensor(self.scope, "pre_ids", np_data) tensor = create_tensor(self.scope, 'pre_ids', np_data)
def _create_pre_scores(self):
np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32')
tensor = create_tensor(self.scope, 'pre_scores', np_data)
def _create_ids(self): def _create_ids(self):
self.lod = [[1, 3], [1, 1, 1, 1]] self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
np_data = np.array( np_data = np.array(
[[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
tensor = create_tensor(self.scope, "ids", np_data) tensor = create_tensor(self.scope, "ids", np_data)
tensor.set_recursive_sequence_lengths(self.lod) tensor.set_lod(self.lod)
def _create_scores(self): def _create_scores(self):
np_data = np.array( np_data = np.array(
...@@ -71,7 +87,7 @@ class BeamSearchOpTester(unittest.TestCase): ...@@ -71,7 +87,7 @@ class BeamSearchOpTester(unittest.TestCase):
], ],
dtype='float32') dtype='float32')
tensor = create_tensor(self.scope, "scores", np_data) tensor = create_tensor(self.scope, "scores", np_data)
tensor.set_recursive_sequence_lengths(self.lod) tensor.set_lod(self.lod)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -114,6 +114,23 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): ...@@ -114,6 +114,23 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
self.check_output() self.check_output()
class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
def setUp(self):
self.op_type = 'bipartite_match'
lod = [[300]]
dist = np.random.random((300, 17)).astype('float32')
match_indices, match_dist = batch_bipartite_match(dist, lod[0])
self.inputs = {'DistMat': dist}
self.outputs = {
'ColToRowMatchIndices': match_indices,
'ColToRowMatchDist': match_dist,
}
def test_check_output(self):
self.check_output()
class TestBipartiteMatchOpWithPerPredictionType(OpTest): class TestBipartiteMatchOpWithPerPredictionType(OpTest):
def setUp(self): def setUp(self):
self.op_type = 'bipartite_match' self.op_type = 'bipartite_match'
......
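A simplified greedy sketch of what the new large-scale case exercises (not the operator's exact semantics, which has additional matching modes): repeatedly take the largest remaining distance, pair that row and column, and record the match per column.

import numpy as np

def greedy_bipartite_match(dist):
    # dist: [num_rows, num_cols]; returns, per column, the matched row index
    # and the matched distance, pairing the globally largest entries first.
    dist = dist.copy()
    rows, cols = dist.shape
    col_to_row = np.full(cols, -1, dtype=np.int64)
    col_dist = np.zeros(cols, dtype=dist.dtype)
    for _ in range(min(rows, cols)):
        r, c = np.unravel_index(np.argmax(dist), dist.shape)
        col_to_row[c], col_dist[c] = r, dist[r, c]
        dist[r, :] = -1.0   # each row and column can be matched only once
        dist[:, c] = -1.0
    return col_to_row, col_dist

dist = np.random.random((300, 17)).astype('float32')   # same shape as the new test case
match_indices, match_dist = greedy_bipartite_match(dist)
print(match_indices.shape, match_dist.shape)            # (17,) (17,)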
...@@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups): ...@@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups):
self.op_type = "conv2d_transpose" self.op_type = "conv2d_transpose"
class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.dilations = [1, 1]
self.input_size = [2, 8, 16, 16] # NCHW
self.groups = 8
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [self.input_size[1], f_c, 4, 4]
self.op_type = "depthwise_conv2d_transpose"
# Please Don't remove the following code. # Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv. # Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation): # class TestCUDNNWithDilation(TestWithDilation):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
SEED = 1
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# the random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter): refine the initializer and random seed setting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(batch_size):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
t = fluid.DistributeTranspiler()
t.transpile(
trainer_id=trainer_id,
program=main_program,
pservers=pserver_endpoints,
trainers=trainers)
return t
def run_pserver(pserver_endpoints, trainers, current_endpoint):
get_model(batch_size=20)
t = get_transpiler(0,
fluid.default_main_program(), pserver_endpoints,
trainers)
pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
exe.run(pserver_prog)
class TestDistMnist(unittest.TestCase):
def setUp(self):
self._trainers = 1
self._pservers = 1
self._ps_endpoints = "127.0.0.1:9123"
def start_pserver(self, endpoint):
p = Process(
target=run_pserver,
args=(self._ps_endpoints, self._trainers, endpoint))
p.start()
return p.pid
def _wait_ps_ready(self, pid):
retry_times = 5
while True:
assert retry_times >= 0, "wait ps ready failed"
time.sleep(1)
try:
# the listen_and_serv_op writes a file containing its listen port
# under the /tmp directory once it is ready to process RPC calls.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
retry_times -= 1
def stop_pserver(self, pid):
os.kill(pid, signal.SIGTERM)
def test_with_place(self):
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
pserver_pid = self.start_pserver(self._ps_endpoints)
self._wait_ps_ready(pserver_pid)
self.run_trainer(p, 0)
self.stop_pserver(pserver_pid)
def run_trainer(self, place, trainer_id):
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
batch_size=20)
t = get_transpiler(trainer_id,
fluid.default_main_program(), self._ps_endpoints,
self._trainers)
trainer_prog = t.get_trainer_program()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in xrange(10):
for batch_id, data in enumerate(train_reader()):
exe.run(trainer_prog, feed=feeder.feed(data))
if (batch_id + 1) % 10 == 0:
acc_set = []
avg_loss_set = []
for test_data in test_reader():
acc_np, avg_loss_np = exe.run(
program=test_program,
feed=feeder.feed(test_data),
fetch_list=[batch_acc, avg_cost])
acc_set.append(float(acc_np))
avg_loss_set.append(float(avg_loss_np))
# get test acc and loss
acc_val = np.array(acc_set).mean()
avg_loss_val = np.array(avg_loss_set).mean()
if float(acc_val
) > 0.8: # Smaller value to increase CI speed
return
else:
print(
'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
format(pass_id, batch_id + 1,
float(avg_loss_val), float(acc_val)))
if math.isnan(float(avg_loss_val)):
assert ("got Nan loss, training failed.")
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_elementwise_add_op import *
'''
Some tests differ from the tests defined in test_elementwise_add_op.py
because MKLDNN does not support 3-dimensional tensors.
Such tensors cause exceptions in the MKLDNN reorder primitive.
'''
class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
def init_kernel_type(self):
self.use_mkldnn = True
class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(2).astype(self.dtype)
self.out = self.x + self.y.reshape(2, 1, 1, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 1, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
def init_input_output(self):
self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
self.y = np.random.rand(4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 1, 4)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_rowwise_add_0(
TestElementwiseAddOp_rowwise_add_0):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_rowwise_add_1(
TestElementwiseAddOp_rowwise_add_1):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_channelwise_add(
TestElementwiseAddOp_channelwise_add):
def init_input_output(self):
self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__':
unittest.main()
...@@ -18,19 +18,23 @@ from op_test import OpTest ...@@ -18,19 +18,23 @@ from op_test import OpTest
class TestElementwiseAddOp(OpTest): class TestElementwiseAddOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self): def setUp(self):
self.op_type = "elementwise_add" self.op_type = "elementwise_add"
self.dtype = np.float32 self.dtype = np.float32
self.axis = -1 self.axis = -1
self.init_dtype() self.init_dtype()
self.init_input_output() self.init_input_output()
self.init_kernel_type()
self.init_axis() self.init_axis()
self.inputs = { self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y) 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
} }
self.attrs = {'axis': self.axis} self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out} self.outputs = {'Out': self.out}
def test_check_output(self): def test_check_output(self):
......
...@@ -401,7 +401,7 @@ class TestBook(unittest.TestCase): ...@@ -401,7 +401,7 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
print(str(program)) print(str(program))
def test_maxout(self): def test_crop(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
x = layers.data(name='x', shape=[3, 5], dtype="float32") x = layers.data(name='x', shape=[3, 5], dtype="float32")
...@@ -410,6 +410,24 @@ class TestBook(unittest.TestCase): ...@@ -410,6 +410,24 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(output) self.assertIsNotNone(output)
print(str(program)) print(str(program))
def test_mean_iou(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[16], dtype='float32')
y = layers.data(name='label', shape=[1], dtype='int64')
iou = layers.mean_iou(x, y, 2)
self.assertIsNotNone(iou)
print(str(program))
def test_argsort(self):
program = Program()
with program_guard(program):
data = layers.data(name='x', shape=[2, 3, 3], dtype="float32")
out, ids = layers.argsort(input=data, axis=1)
self.assertIsNotNone(out)
self.assertIsNotNone(ids)
print(str(program))
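For an end-to-end sanity check of the values, a small numpy sketch can be used outside the unit test (purely illustrative; it assumes the op follows numpy's sort/argsort semantics along the chosen axis):
import numpy as np
# Hypothetical reference values for the argsort layer above (axis=1).
x = np.random.random((2, 3, 3)).astype('float32')
expected_out = np.sort(x, axis=1)      # sorted values along axis 1
expected_ids = np.argsort(x, axis=1)   # indices that produce the sorted values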
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -169,9 +169,8 @@ class TestCRFModel(unittest.TestCase): ...@@ -169,9 +169,8 @@ class TestCRFModel(unittest.TestCase):
data = train_data() data = train_data()
for i in xrange(10): for i in xrange(10):
cur_batch = next(data) cur_batch = next(data)
print map(np.array, print pe.run(feed=feeder.feed(cur_batch),
pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]
fetch_list=[avg_cost.name]))[0]
@unittest.skip(reason="CI hangs") @unittest.skip(reason="CI hangs")
def test_update_sparse_parameter_all_reduce(self): def test_update_sparse_parameter_all_reduce(self):
......
...@@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase): ...@@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase):
fetch_list.append(k) fetch_list.append(k)
for data in train_inputs: for data in train_inputs:
ret = pe.run(fetch_list, feed=feeder.feed(data)) ret = pe.run(fetch_list,
feed=feeder.feed(data),
return_numpy=True)
for i in range(len(fetch_list)): for i in range(len(fetch_list)):
assert not math.isnan(np.sum(ret[i])) and \ assert not math.isnan(np.sum(ret[i])) and \
not math.isinf(np.sum(ret[i])) not math.isinf(np.sum(ret[i]))
...@@ -128,7 +130,7 @@ class TestFeedParallel(unittest.TestCase): ...@@ -128,7 +130,7 @@ class TestFeedParallel(unittest.TestCase):
use_cuda=use_cuda, loss_name=loss.name, main_program=main) use_cuda=use_cuda, loss_name=loss.name, main_program=main)
for batch_id, data in enumerate(reader()): for batch_id, data in enumerate(reader()):
loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) loss_np = pe.run(feed=data, fetch_list=[loss.name])[0]
print batch_id, loss_np print batch_id, loss_np
if batch_id == 2: if batch_id == 2:
break break
......
...@@ -16,6 +16,8 @@ import paddle.fluid as fluid ...@@ -16,6 +16,8 @@ import paddle.fluid as fluid
import numpy as np import numpy as np
import unittest import unittest
import os import os
import sys
import math
def simple_fc_net(): def simple_fc_net():
...@@ -70,10 +72,17 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): ...@@ -70,10 +72,17 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
for i in xrange(5): for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict) test_loss, = test_exe.run([loss.name], feed=feed_dict)
test_loss = np.array(test_loss)
train_loss, = train_exe.run([loss.name], feed=feed_dict) train_loss, = train_exe.run([loss.name], feed=feed_dict)
train_loss = np.array(train_loss)
avg_test_loss_val = np.array(test_loss).mean()
if math.isnan(float(avg_test_loss_val)):
sys.exit("got NaN loss, testing failed.")
avg_train_loss_val = np.array(train_loss).mean()
if math.isnan(float(avg_train_loss_val)):
sys.exit("got NaN loss, training failed.")
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
train_loss, test_loss, atol=1e-8), train_loss, test_loss, atol=1e-8),
......
...@@ -119,27 +119,20 @@ class CheckpointConfig(object): ...@@ -119,27 +119,20 @@ class CheckpointConfig(object):
max_num_checkpoints=3, max_num_checkpoints=3,
epoch_interval=1, epoch_interval=1,
step_interval=10): step_interval=10):
if checkpoint_dir is None:
self.checkpoint_dir = os.getcwd()
else:
self.checkpoint_dir = checkpoint_dir
self.max_num_checkpoints = max_num_checkpoints
if epoch_interval < 1:
self.epoch_interval = 1
else:
self.epoch_interval = epoch_interval
if step_interval < 1: assert epoch_interval >= 1
self.step_interval = 10 assert step_interval >= 1
else:
self.step_interval = step_interval
self.checkpoint_dir = checkpoint_dir \
if checkpoint_dir is not None else os.getcwd()
self.max_num_checkpoints = max_num_checkpoints
self.epoch_interval = epoch_interval
self.step_interval = step_interval
self.epoch_id = 0 self.epoch_id = 0
self.step_id = 0 self.step_id = 0
self.load_serial = None self.load_serial = None
self.is_pserver = False self.pserver_id = None
self.lookup_table_name = None
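For reference, a minimal construction sketch of the refactored config (the directory path is illustrative; how the config is handed to the trainer is assumed rather than shown here):
# A minimal sketch; values other than the defaults are illustrative.
config = CheckpointConfig(
    checkpoint_dir='/tmp/ckpt',   # falls back to os.getcwd() when None
    max_num_checkpoints=3,
    epoch_interval=1,             # must be >= 1, enforced by the asserts above
    step_interval=10)             # must be >= 1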
def check_and_get_place(place): def check_and_get_place(place):
...@@ -290,13 +283,20 @@ class Trainer(object): ...@@ -290,13 +283,20 @@ class Trainer(object):
self.checkpoint_cfg.load_serial, self.checkpoint_cfg.load_serial,
self.startup_program) self.startup_program)
if not self.checkpoint_cfg.is_pserver: if not self.checkpoint_cfg.pserver_id:
epoch_id, step_id = io.load_trainer_args( epoch_id, step_id = io.load_trainer_args(
self.checkpoint_cfg.checkpoint_dir, self.checkpoint_cfg.checkpoint_dir,
self.checkpoint_cfg.load_serial, self.trainer_id, self.checkpoint_cfg.load_serial, self.trainer_id,
self._get_checkpoint_load_args()) self._get_checkpoint_load_args())
self.checkpoint_cfg.epoch_id = int(epoch_id) self.checkpoint_cfg.epoch_id = int(epoch_id)
self.checkpoint_cfg.step_id = int(step_id) self.checkpoint_cfg.step_id = int(step_id)
else:
if self.checkpoint_cfg.lookup_table_name:
io.load_lookup_table_vars(
exe, self.checkpoint_cfg.checkpoint_dir,
self.startup_program,
self.checkpoint_cfg.pserver_id,
self.checkpoint_cfg.lookup_table_name)
if param_path and os.path.isdir(param_path): if param_path and os.path.isdir(param_path):
# load params from param_path into scope # load params from param_path into scope
...@@ -315,7 +315,7 @@ class Trainer(object): ...@@ -315,7 +315,7 @@ class Trainer(object):
for ip in worker_ips.split(","): for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port])) worker_endpoints.append(':'.join([ip, port]))
self.num_trainers = len(worker_endpoints) self.num_trainers = len(worker_endpoints)
current_endpoint = os.getenv("POD_IP") + ":" + port current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint) worker_endpoints.remove(current_endpoint)
# TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
# in ParallelExecutor to start # in ParallelExecutor to start
...@@ -366,7 +366,10 @@ class Trainer(object): ...@@ -366,7 +366,10 @@ class Trainer(object):
self.trainer_id, pservers=pserver_endpoints, trainers=trainers) self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER": if training_role == "PSERVER":
if self.checkpoint_cfg: if self.checkpoint_cfg:
self.is_pserver = True pserver_id = eplist.index(current_endpoint)
self.checkpoint_cfg.pserver_id = pserver_id
if t.has_distributed_lookup_table:
self.checkpoint_cfg.lookup_table_name = t.table_name
self.train_program = t.get_pserver_program(current_endpoint) self.train_program = t.get_pserver_program(current_endpoint)
self.startup_program = t.get_startup_program(current_endpoint, self.startup_program = t.get_startup_program(current_endpoint,
...@@ -566,7 +569,8 @@ class Trainer(object): ...@@ -566,7 +569,8 @@ class Trainer(object):
def _save_checkpoint(self, epoch_id, step_id): def _save_checkpoint(self, epoch_id, step_id):
assert self.checkpoint_cfg assert self.checkpoint_cfg
if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
and step_id % self.checkpoint_cfg.step_interval == 0:
exe = executor.Executor(self.place) exe = executor.Executor(self.place)
io.save_checkpoint( io.save_checkpoint(
executor=exe, executor=exe,
......
...@@ -301,8 +301,8 @@ class DistributeTranspiler(object): ...@@ -301,8 +301,8 @@ class DistributeTranspiler(object):
Program: trainer side program. Program: trainer side program.
""" """
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
# FIXME(typhoonzero): serialize once will fix error occurs when clone.
self.origin_program.__str__() self.origin_program.__str__()
return self.origin_program return self.origin_program
...@@ -383,11 +383,12 @@ class DistributeTranspiler(object): ...@@ -383,11 +383,12 @@ class DistributeTranspiler(object):
if self._is_adam_connected_op(op): if self._is_adam_connected_op(op):
global_ops.append(op) global_ops.append(op)
def __append_optimize_op__(op, block, grad_to_block_id, merged_var): def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
lr_ops):
if self._is_optimizer_op(op): if self._is_optimizer_op(op):
self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
self.origin_program, merged_var) self.origin_program, merged_var)
else: elif op not in lr_ops:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
def __op_have_grad_input__(op): def __op_have_grad_input__(op):
...@@ -452,7 +453,7 @@ class DistributeTranspiler(object): ...@@ -452,7 +453,7 @@ class DistributeTranspiler(object):
# optimizer is connected to itself # optimizer is connected to itself
if ufind.is_connected(op, opt_op) and op not in global_ops: if ufind.is_connected(op, opt_op) and op not in global_ops:
__append_optimize_op__(op, per_opt_block, grad_to_block_id, __append_optimize_op__(op, per_opt_block, grad_to_block_id,
merged_var) merged_var, lr_ops)
# append global ops # append global ops
if global_ops: if global_ops:
...@@ -461,7 +462,7 @@ class DistributeTranspiler(object): ...@@ -461,7 +462,7 @@ class DistributeTranspiler(object):
optimize_blocks.append(opt_state_block) optimize_blocks.append(opt_state_block)
for glb_op in global_ops: for glb_op in global_ops:
__append_optimize_op__(glb_op, opt_state_block, __append_optimize_op__(glb_op, opt_state_block,
grad_to_block_id, None) grad_to_block_id, None, lr_ops)
# process distributed lookup_table # process distributed lookup_table
prefetch_var_name_to_block_id = [] prefetch_var_name_to_block_id = []
...@@ -471,6 +472,8 @@ class DistributeTranspiler(object): ...@@ -471,6 +472,8 @@ class DistributeTranspiler(object):
pserver_index, pserver_program, pre_block_idx, grad_to_block_id) pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
prefetch_var_name_to_block_id = self._create_prefetch_block( prefetch_var_name_to_block_id = self._create_prefetch_block(
pserver_index, pserver_program, table_opt_block) pserver_index, pserver_program, table_opt_block)
checkpoint_block_id = self._create_checkpoint_save_block(
pserver_program, table_opt_block.idx)
# NOTE: if has_distributed_lookup_table is False, then prefetch_block will # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
# not be executed, so it's safe to use optimize_block to hold the place # not be executed, so it's safe to use optimize_block to hold the place
...@@ -489,6 +492,7 @@ class DistributeTranspiler(object): ...@@ -489,6 +492,7 @@ class DistributeTranspiler(object):
if len(prefetch_var_name_to_block_id) > 0: if len(prefetch_var_name_to_block_id) > 0:
attrs['prefetch_var_name_to_block_id'] \ attrs['prefetch_var_name_to_block_id'] \
= prefetch_var_name_to_block_id = prefetch_var_name_to_block_id
attrs['checkpint_block_id'] = checkpoint_block_id
# step5 append the listen_and_serv op # step5 append the listen_and_serv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
...@@ -534,7 +538,6 @@ class DistributeTranspiler(object): ...@@ -534,7 +538,6 @@ class DistributeTranspiler(object):
# 2. rename op outputs # 2. rename op outputs
for op in orig_s_prog.global_block().ops: for op in orig_s_prog.global_block().ops:
new_inputs = dict()
new_outputs = dict() new_outputs = dict()
# do not append startup op if var is not on this pserver # do not append startup op if var is not on this pserver
op_on_pserver = False op_on_pserver = False
...@@ -910,6 +913,27 @@ class DistributeTranspiler(object): ...@@ -910,6 +913,27 @@ class DistributeTranspiler(object):
return table_opt_block return table_opt_block
def _create_checkpoint_save_block(self, pserver_program, pre_block_idx):
"""
Create a new block that handles saving the checkpoint.
"""
import os
pserver_program.global_block().create_var(
name="kLookupTablePath",
persistable=True,
type=core.VarDesc.VarType.RAW)
checkpoint_save_block = pserver_program.create_block(pre_block_idx)
# this 'file_path' attribute is not used when saving the lookup table variable
checkpoint_save_block.append_op(
type='save',
inputs={'X': [self.table_name]},
outputs={},
attrs={'file_path': "none"})
return checkpoint_save_block.idx
def _create_vars_from_blocklist(self, def _create_vars_from_blocklist(self,
program, program,
block_list, block_list,
...@@ -1299,16 +1323,6 @@ class DistributeTranspiler(object): ...@@ -1299,16 +1323,6 @@ class DistributeTranspiler(object):
ufind.union(op1, op2) ufind.union(op1, op2)
return ufind return ufind
def _is_opt_role_op(self, op):
# NOTE: depend on oprole to find out whether this op is for
# optimize
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attrs and \
int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
return False
def _is_optimizer_op(self, op): def _is_optimizer_op(self, op):
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names: "LearningRate" in op.input_names:
...@@ -1399,7 +1413,10 @@ class DistributeTranspiler(object): ...@@ -1399,7 +1413,10 @@ class DistributeTranspiler(object):
params_grads = [] params_grads = []
origin_var_dict = self.origin_program.global_block().vars origin_var_dict = self.origin_program.global_block().vars
for op in block.ops: for op in block.ops:
if self._is_opt_role_op(op): # NOTE(Yancey1989): we cannot rely on the op role to tell whether an op
# is an optimizer op, because every op in the optimizer sub-graph
# is tagged with the optimizer op role
if self._is_optimizer_op(op):
opt_ops.append(op) opt_ops.append(op)
# HACK(wuyi): if we find grad vars from input of optimize # HACK(wuyi): if we find grad vars from input of optimize
# ops, we may get the output of clip op. Use syntax "@GRAD" # ops, we may get the output of clip op. Use syntax "@GRAD"
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import numpy as np import numpy as np
from .. import core from .. import core
from ..framework import Program from ..framework import Program
...@@ -20,12 +21,15 @@ from ..executor import global_scope ...@@ -20,12 +21,15 @@ from ..executor import global_scope
class InferenceTranspiler: class InferenceTranspiler:
''' '''
Convert the fluid program to optimized inference program. Convert the fluid program to optimized inference program.
There are several optimizations, only fuse batch normalization is supported now. There are several optimizations:
- fuse convolution and batch normalization
- fuse batch normalization and relu (MKLDNN only)
Examples: Examples:
.. code-block:: python .. code-block:: python
# As InferenceTranspiler will modify the original program, # As InferenceTranspiler will modify the original program,
...@@ -54,19 +58,64 @@ class InferenceTranspiler: ...@@ -54,19 +58,64 @@ class InferenceTranspiler:
if not isinstance(scope, core.Scope): if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
self.fuse_batch_norm(program, place, scope) self.fuse_batch_norm(program, place, scope)
self.fuse_relu_mkldnn(program)
def fuse_relu_mkldnn(self, program):
'''
Transpile the program by fused relu activation for MKLDNN program.
Relu activation following a batch norm OP can be fused by adding
the :math:`fuse_with_relu` attribute to the batch norm OP.
The result of fuse is:
- before:
- batch_norm->relu->any_other_op
- after:
- batch_norm->any_other_op
:param program: program to transpile
:type program: Program
'''
use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
if not use_mkldnn:
return
self.block = program.block(0)
i = 0
while i < len(self.block.ops) - 1:
current_op = self.block.ops[i]
if current_op.type in ['batch_norm']:
next_op = self.block.ops[i + 1]
if next_op.type == 'relu':
# modify bnorm OP to include relu
current_op.set_attr("fuse_with_relu", True)
# remove relu OP
self.block.remove_op(i + 1)
i = i + 1
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
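A hedged usage sketch of the transpiler path added above (the inference_program variable is illustrative; FLAGS_use_mkldnn is the same environment variable checked inside fuse_relu_mkldnn):
import os
import paddle.fluid as fluid
os.environ['FLAGS_use_mkldnn'] = 'True'   # enable the MKLDNN-only relu fusion
place = fluid.CPUPlace()
t = fluid.InferenceTranspiler()
t.transpile(inference_program, place)     # inference_program is assumed to exist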
def fuse_batch_norm(self, program, place, scope): def fuse_batch_norm(self, program, place, scope):
''' '''
Transpile the program by fused batch normalization. Transpile the program by fused batch normalization.
The batch normalization following the convolution or fully connected layer The batch normalization following the convolution or fully connected layer
can be integrated with it. Doing so gives a forward acceleration, can be integrated with it. Doing so gives a forward acceleration,
especially in environments like mobile or embedded. especially in environments like mobile or embedded.
For input :math:`X`: For input :math:`X`:
- Conv process: :math:`X = input * W + bias` - Conv process: :math:`X = input * W + bias`
- Batch norm process: :math:`X' = (X - mean) / std` - Batch norm process: :math:`X' = (X - mean) / std`
- Scale Process: :math:`Y = a * X' + b` - Scale Process: :math:`Y = a * X' + b`
After fuse into one operation: After fuse into one operation:
...@@ -76,17 +125,17 @@ class InferenceTranspiler: ...@@ -76,17 +125,17 @@ class InferenceTranspiler:
Y &= (input * W + bias - mean) / std * a + b \\\\ Y &= (input * W + bias - mean) / std * a + b \\\\
&= input * a * W / std + ((bias - mean) / std * a + b) &= input * a * W / std + ((bias - mean) / std * a + b)
The operator transformation is: The operator transformation is:
- before: - before:
- conv->batch_norm->any_other_op (bias == 0) - conv->batch_norm->any_other_op (bias == 0)
- conv->elementwise_add->batch_norm->any_other_op (bias != 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
- after: - after:
- conv->elementwise_add->any_other_op - conv->elementwise_add->any_other_op
The transpile stages are: The transpile stages are:
1. insert elementwise_add op when bias == 0. 1. insert elementwise_add op when bias == 0.
...@@ -99,20 +148,20 @@ class InferenceTranspiler: ...@@ -99,20 +148,20 @@ class InferenceTranspiler:
program (Program): program to transpile program (Program): program to transpile
place (Place): inference place place (Place): inference place
scope (Scope): inference Scope scope (Scope): inference Scope
''' '''
self.scope = scope self.scope = scope
self.place = place self.place = place
self.block = program.block(0) self.block = program.block(0)
self.input_map = {} # store the input names should be adjusted self.input_map = {} # store the input names should be adjusted
i = 0 i = 0
while i < len(self.block.ops): while i < len(self.block.ops) - 2:
current_op = self.block.ops[i] current_op = self.block.ops[i]
# TODO(luotao1): consider only conv2d now. fc would be dealt with later. # TODO(luotao1): consider only conv2d now. fc would be dealt with later.
if current_op.type in ['conv2d']: if current_op.type in ['conv2d']:
# TODO(luotao1): consider single chain network now. # TODO(luotao1): consider single chain network now.
# For branch networks, we couldn't use block.ops[i + 1] as # For branch networks, we couldn't use block.ops[i + 1] as
# the judgment condition. # the judgment condition.
next_op = self.block.ops[i + 1] next_op = self.block.ops[i + 1]
# conv2d without bias # conv2d without bias
...@@ -137,17 +186,17 @@ class InferenceTranspiler: ...@@ -137,17 +186,17 @@ class InferenceTranspiler:
self._adjust_input() self._adjust_input()
self._remove_unused_var() self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force, # TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately. # since some large program.desc will not be flushed immediately.
# And a better solution will be considered later. # And a better solution will be considered later.
program = program.clone() program = program.clone()
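As a standalone check of the algebra in the docstring above, a scalar numpy sketch (illustrative only, not part of the transpiler):
import numpy as np
# (x * W + bias - mean) / std * a + b  ==  x * (a * W / std) + ((bias - mean) / std * a + b)
x, W, bias = 1.7, 0.5, 0.3
mean, std, a, b = 0.2, 1.1, 0.9, 0.05
y_unfused = (x * W + bias - mean) / std * a + b
W_fused = a * W / std
bias_fused = (bias - mean) / std * a + b
assert np.isclose(y_unfused, x * W_fused + bias_fused)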
# ====================== private transpiler functions ===================== # ====================== private transpiler functions =====================
def _insert_bias_op(self, index, current_op, bn_op): def _insert_bias_op(self, index, current_op, bn_op):
''' '''
Construct elementwise_add operator for adding bias Construct elementwise_add operator for adding bias
and insert it into program. and insert it into program.
:param index: insert location of bias_op :param index: insert location of bias_op
:type index: Int :type index: Int
:param current_op: current operator (conv or fc) :param current_op: current operator (conv or fc)
...@@ -175,14 +224,14 @@ class InferenceTranspiler: ...@@ -175,14 +224,14 @@ class InferenceTranspiler:
def _fuse_param(self, current_op, bn_op, bias_op, with_bias): def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
''' '''
fuse the batch_norm_op's parameters into current_op (conv or fc) fuse the batch_norm_op's parameters into current_op (conv or fc)
:param current_op: current operator (conv or fc) :param current_op: current operator (conv or fc)
:type current_op: Operator :type current_op: Operator
:param bn_op: batch norm operator :param bn_op: batch norm operator
:type bn_op: Operator :type bn_op: Operator
:param bias_op: elementwise_add operator for adding bias :param bias_op: elementwise_add operator for adding bias
:type bias_op: Operator :type bias_op: Operator
:param with_bias: If current operator has bias, with_bias = 1; otherwise 0. :param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
:type with_bias: Int :type with_bias: Int
''' '''
......
...@@ -112,7 +112,7 @@ def fetch(): ...@@ -112,7 +112,7 @@ def fetch():
paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
def convert(path): def convert(path):
......